diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/backend/utils/adt/Makefile | 3 | ||||
-rw-r--r-- | src/backend/utils/adt/rangetypes.c | 17 | ||||
-rw-r--r-- | src/backend/utils/adt/rangetypes_selfuncs.c | 599 | ||||
-rw-r--r-- | src/backend/utils/adt/rangetypes_typanalyze.c | 232 | ||||
-rw-r--r-- | src/include/catalog/catversion.h | 2 | ||||
-rw-r--r-- | src/include/catalog/pg_operator.h | 39 | ||||
-rw-r--r-- | src/include/catalog/pg_proc.h | 2 | ||||
-rw-r--r-- | src/include/catalog/pg_statistic.h | 18 | ||||
-rw-r--r-- | src/include/utils/rangetypes.h | 1 |
9 files changed, 881 insertions, 32 deletions
diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index a692086bdfd..a929f4ae197 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -30,7 +30,8 @@ OBJS = acl.o arrayfuncs.o array_selfuncs.o array_typanalyze.o \ tsginidx.o tsgistidx.o tsquery.o tsquery_cleanup.o tsquery_gist.o \ tsquery_op.o tsquery_rewrite.o tsquery_util.o tsrank.o \ tsvector.o tsvector_op.o tsvector_parser.o \ - txid.o uuid.o windowfuncs.o xml.o rangetypes_spgist.o + txid.o uuid.o windowfuncs.o xml.o rangetypes_spgist.o \ + rangetypes_typanalyze.o rangetypes_selfuncs.o like.o: like.c like_match.c diff --git a/src/backend/utils/adt/rangetypes.c b/src/backend/utils/adt/rangetypes.c index fe9e0c4bc6a..f229a9d7b91 100644 --- a/src/backend/utils/adt/rangetypes.c +++ b/src/backend/utils/adt/rangetypes.c @@ -1228,23 +1228,6 @@ hash_range(PG_FUNCTION_ARGS) PG_RETURN_INT32(result); } -/* ANALYZE support */ - -/* typanalyze function for range datatypes */ -Datum -range_typanalyze(PG_FUNCTION_ARGS) -{ - /* - * For the moment, just punt and don't analyze range columns. If we get - * close to release without having a better answer, we could consider - * letting std_typanalyze do what it can ... but those stats are probably - * next door to useless for most activity with range columns, so it's not - * clear it's worth gathering them. - */ - PG_RETURN_BOOL(false); -} - - /* *---------------------------------------------------------- * CANONICAL FUNCTIONS diff --git a/src/backend/utils/adt/rangetypes_selfuncs.c b/src/backend/utils/adt/rangetypes_selfuncs.c new file mode 100644 index 00000000000..ab2c6d69644 --- /dev/null +++ b/src/backend/utils/adt/rangetypes_selfuncs.c @@ -0,0 +1,599 @@ +/*------------------------------------------------------------------------- + * + * rangetypes_selfuncs.c + * Functions for selectivity estimation of range operators + * + * Estimates are based on histograms of lower and upper bounds, and the + * fraction of empty ranges. + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/utils/adt/rangetypes_selfuncs.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "catalog/pg_operator.h" +#include "catalog/pg_statistic.h" +#include "utils/lsyscache.h" +#include "utils/rangetypes.h" +#include "utils/selfuncs.h" +#include "utils/typcache.h" + +static double calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata, + RangeType *constval, Oid operator); +static double default_range_selectivity(Oid operator); +static double calc_hist_selectivity(TypeCacheEntry *typcache, + VariableStatData *vardata, RangeType *constval, + Oid operator); +static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache, + RangeBound *constbound, + RangeBound *hist, int hist_nvalues, + bool equal); +static int rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value, + RangeBound *hist, int hist_length, bool equal); +static float8 get_position(TypeCacheEntry *typcache, RangeBound *value, + RangeBound *hist1, RangeBound *hist2); + +/* + * Returns a default selectivity estimate for given operator, when we don't + * have statistics or cannot use them for some reason. + */ +static double +default_range_selectivity(Oid operator) +{ + switch (operator) + { + case OID_RANGE_OVERLAP_OP: + return 0.01; + + case OID_RANGE_CONTAINS_OP: + case OID_RANGE_CONTAINED_OP: + return 0.005; + + case OID_RANGE_CONTAINS_ELEM_OP: + /* + * "range @> elem" is more or less identical to a scalar + * inequality "A >= b AND A <= c". + */ + return DEFAULT_RANGE_INEQ_SEL; + + case OID_RANGE_LESS_OP: + case OID_RANGE_LESS_EQUAL_OP: + case OID_RANGE_GREATER_OP: + case OID_RANGE_GREATER_EQUAL_OP: + case OID_RANGE_LEFT_OP: + case OID_RANGE_RIGHT_OP: + /* these are similar to regular scalar inequalities */ + return DEFAULT_INEQ_SEL; + + default: + /* all range operators should be handled above, but just in case */ + return 0.01; + } +} + +/* + * rangesel -- restriction selectivity for range operators + */ +Datum +rangesel(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + Oid operator = PG_GETARG_OID(1); + List *args = (List *) PG_GETARG_POINTER(2); + int varRelid = PG_GETARG_INT32(3); + VariableStatData vardata; + Node *other; + bool varonleft; + Selectivity selec; + TypeCacheEntry *typcache; + RangeType *constrange = NULL; + + /* + * If expression is not (variable op something) or (something op + * variable), then punt and return a default estimate. + */ + if (!get_restriction_variable(root, args, varRelid, + &vardata, &other, &varonleft)) + PG_RETURN_FLOAT8(default_range_selectivity(operator)); + + /* + * Can't do anything useful if the something is not a constant, either. + */ + if (!IsA(other, Const)) + { + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(default_range_selectivity(operator)); + } + + /* + * All the range operators are strict, so we can cope with a NULL constant + * right away. + */ + if (((Const *) other)->constisnull) + { + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(0.0); + } + + /* + * If var is on the right, commute the operator, so that we can assume the + * var is on the left in what follows. + */ + if (!varonleft) + { + /* we have other Op var, commute to make var Op other */ + operator = get_commutator(operator); + if (!operator) + { + /* Use default selectivity (should we raise an error instead?) */ + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(default_range_selectivity(operator)); + } + } + + typcache = range_get_typcache(fcinfo, vardata.vartype); + + /* + * OK, there's a Var and a Const we're dealing with here. We need the + * Const to be of same range type as the column, else we can't do anything + * useful. (Such cases will likely fail at runtime, but here we'd rather + * just return a default estimate.) + * + * If the operator is "range @> element", the constant should be of the + * element type of the range column. Convert it to a range that includes + * only that single point, so that we don't need special handling for + * that in what follows. + */ + if (operator == OID_RANGE_CONTAINS_ELEM_OP) + { + if (((Const *) other)->consttype == typcache->rngelemtype->type_id) + { + RangeBound lower, upper; + lower.inclusive = true; + lower.val = ((Const *) other)->constvalue; + lower.infinite = false; + lower.lower = true; + upper.inclusive = true; + upper.val = ((Const *) other)->constvalue; + upper.infinite = false; + upper.lower = false; + constrange = range_serialize(typcache, &lower, &upper, false); + } + } + else + { + if (((Const *) other)->consttype == vardata.vartype) + constrange = DatumGetRangeType(((Const *) other)->constvalue); + } + + /* + * If we got a valid constant on one side of the operator, proceed to + * estimate using statistics. Otherwise punt and return a default + * constant estimate. + */ + if (constrange) + selec = calc_rangesel(typcache, &vardata, constrange, operator); + else + selec = default_range_selectivity(operator); + + ReleaseVariableStats(vardata); + + CLAMP_PROBABILITY(selec); + + PG_RETURN_FLOAT8((float8) selec); +} + +static double +calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata, + RangeType *constval, Oid operator) +{ + double hist_selec; + double selec; + float4 empty_frac, null_frac; + + /* + * First look up the fraction of NULLs and empty ranges from pg_statistic. + */ + if (HeapTupleIsValid(vardata->statsTuple)) + { + Form_pg_statistic stats; + float4 *numbers; + int nnumbers; + + stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); + null_frac = stats->stanullfrac; + + /* Try to get fraction of empty ranges */ + if (get_attstatsslot(vardata->statsTuple, + vardata->atttype, vardata->atttypmod, + STATISTIC_KIND_RANGE_EMPTY_FRAC, InvalidOid, + NULL, + NULL, NULL, + &numbers, &nnumbers)) + { + if (nnumbers != 1) + elog(ERROR, "invalid empty fraction statistic"); /* shouldn't happen */ + empty_frac = numbers[0]; + } + else + { + /* No empty fraction statistic. Assume no empty ranges. */ + empty_frac = 0.0; + } + } + else + { + /* + * No stats are available. Follow through the calculations below + * anyway, assuming no NULLs and no empty ranges. This still allows + * us to give a better-than-nothing estimate based on whether the + * constant is an empty range or not. + */ + null_frac = 0.0; + empty_frac = 0.0; + } + + if (RangeIsEmpty(constval)) + { + /* + * An empty range matches all ranges, all empty ranges, or nothing, + * depending on the operator + */ + switch (operator) + { + case OID_RANGE_OVERLAP_OP: + case OID_RANGE_OVERLAPS_LEFT_OP: + case OID_RANGE_OVERLAPS_RIGHT_OP: + case OID_RANGE_LEFT_OP: + case OID_RANGE_RIGHT_OP: + /* these return false if either argument is empty */ + selec = 0.0; + break; + + case OID_RANGE_CONTAINED_OP: + case OID_RANGE_LESS_EQUAL_OP: + case OID_RANGE_GREATER_EQUAL_OP: + /* + * these return true when both args are empty, false if only + * one is empty + */ + selec = empty_frac; + break; + + case OID_RANGE_CONTAINS_OP: + /* everything contains an empty range */ + selec = 1.0; + break; + + case OID_RANGE_CONTAINS_ELEM_OP: + default: + elog(ERROR, "unexpected operator %u", operator); + selec = 0.0; /* keep compiler quiet */ + break; + } + } + else + { + /* + * Calculate selectivity using bound histograms. If that fails for + * some reason, e.g no histogram in pg_statistic, use the default + * constant estimate for the fraction of non-empty values. This is + * still somewhat better than just returning the default estimate, + * because this still takes into account the fraction of empty and + * NULL tuples, if we had statistics for them. + */ + hist_selec = calc_hist_selectivity(typcache, vardata, constval, + operator); + if (hist_selec < 0.0) + hist_selec = default_range_selectivity(operator); + + /* + * Now merge the results for the empty ranges and histogram + * calculations, realizing that the histogram covers only the + * non-null, non-empty values. + */ + if (operator == OID_RANGE_CONTAINED_OP) + { + /* empty is contained by anything non-empty */ + selec = (1.0 - empty_frac) * hist_selec + empty_frac; + } + else + { + /* with any other operator, empty Op non-empty matches nothing */ + selec = (1.0 - empty_frac) * hist_selec; + } + } + + /* all range operators are strict */ + selec *= (1.0 - null_frac); + + /* result should be in range, but make sure... */ + CLAMP_PROBABILITY(selec); + + return selec; +} + +/* + * Calculate range operator selectivity using histograms of range bounds. + * + * This estimate is for the portion of values that are not empty and not + * NULL. + */ +static double +calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata, + RangeType *constval, Oid operator) +{ + Datum *hist_values; + int nhist; + RangeBound *hist_lower; + RangeBound *hist_upper; + int i; + RangeBound const_lower; + RangeBound const_upper; + bool empty; + double hist_selec; + + /* Try to get histogram of ranges */ + if (!(HeapTupleIsValid(vardata->statsTuple) && + get_attstatsslot(vardata->statsTuple, + vardata->atttype, vardata->atttypmod, + STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid, + NULL, + &hist_values, &nhist, + NULL, NULL))) + return -1.0; + + /* + * Convert histogram of ranges into histograms of its lower and upper + * bounds. + */ + hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist); + hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist); + for (i = 0; i < nhist; i++) + { + range_deserialize(typcache, DatumGetRangeType(hist_values[i]), + &hist_lower[i], &hist_upper[i], &empty); + /* The histogram should not contain any empty ranges */ + if (empty) + elog(ERROR, "bounds histogram contains an empty range"); + } + + /* Extract the bounds of the constant value. */ + range_deserialize(typcache, constval, &const_lower, &const_upper, &empty); + Assert (!empty); + + /* + * Calculate selectivity comparing the lower or upper bound of the + * constant with the histogram of lower or upper bounds. + */ + switch (operator) + { + case OID_RANGE_LESS_OP: + /* + * The regular b-tree comparison operators (<, <=, >, >=) compare + * the lower bounds first, and the upper bounds for values with + * equal lower bounds. Estimate that by comparing the lower bounds + * only. This gives a fairly accurate estimate assuming there + * aren't many rows with a lower bound equal to the constant's + * lower bound. + */ + hist_selec = + calc_hist_selectivity_scalar(typcache, &const_lower, + hist_lower, nhist, false); + break; + + case OID_RANGE_LESS_EQUAL_OP: + hist_selec = + calc_hist_selectivity_scalar(typcache, &const_lower, + hist_lower, nhist, true); + break; + + case OID_RANGE_GREATER_OP: + hist_selec = + 1 - calc_hist_selectivity_scalar(typcache, &const_lower, + hist_lower, nhist, true); + break; + + case OID_RANGE_GREATER_EQUAL_OP: + hist_selec = + 1 - calc_hist_selectivity_scalar(typcache, &const_lower, + hist_lower, nhist, false); + break; + + case OID_RANGE_LEFT_OP: + /* var << const when upper(var) < lower(const) */ + hist_selec = + calc_hist_selectivity_scalar(typcache, &const_lower, + hist_upper, nhist, false); + break; + + case OID_RANGE_RIGHT_OP: + /* var >> const when lower(var) > upper(const) */ + hist_selec = + 1 - calc_hist_selectivity_scalar(typcache, &const_upper, + hist_lower, nhist, true); + break; + + case OID_RANGE_OVERLAPS_RIGHT_OP: + /* compare lower bounds */ + hist_selec = + 1 - calc_hist_selectivity_scalar(typcache, &const_lower, + hist_lower, nhist, false); + break; + + case OID_RANGE_OVERLAPS_LEFT_OP: + /* compare upper bounds */ + hist_selec = + calc_hist_selectivity_scalar(typcache, &const_upper, + hist_upper, nhist, true); + break; + + case OID_RANGE_OVERLAP_OP: + case OID_RANGE_CONTAINS_ELEM_OP: + /* + * A && B <=> NOT (A << B OR A >> B). + * + * "range @> elem" is equivalent to "range && [elem,elem]". The + * caller already constructed the singular range from the element + * constant, so just treat it the same as &&. + */ + hist_selec = + calc_hist_selectivity_scalar(typcache, &const_lower, hist_upper, + nhist, false); + hist_selec += + (1.0 - calc_hist_selectivity_scalar(typcache, &const_upper, hist_lower, + nhist, true)); + hist_selec = 1.0 - hist_selec; + break; + + case OID_RANGE_CONTAINS_OP: + case OID_RANGE_CONTAINED_OP: + /* TODO: not implemented yet */ + hist_selec = -1.0; + break; + + default: + elog(ERROR, "unknown range operator %u", operator); + hist_selec = -1.0; /* keep compiler quiet */ + break; + } + + return hist_selec; +} + + +/* + * Look up the fraction of values less than (or equal, if 'equal' argument + * is true) a given const in a histogram of range bounds. + */ +static double +calc_hist_selectivity_scalar(TypeCacheEntry *typcache, RangeBound *constbound, + RangeBound *hist, int hist_nvalues, bool equal) +{ + Selectivity selec; + int index; + + /* + * Find the histogram bin the given constant falls into. Estimate + * selectivity as the number of preceding whole bins. + */ + index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal); + selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1); + + /* Adjust using linear interpolation within the bin */ + if (index >= 0 && index < hist_nvalues - 1) + selec += get_position(typcache, constbound, &hist[index], + &hist[index + 1]) / (Selectivity) (hist_nvalues - 1); + + return selec; +} + +/* + * Binary search on an array of range bounds. Returns greatest index of range + * bound in array which is less than given range bound. If all range bounds in + * array are greater or equal than given range bound, return -1. + */ +static int +rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist, + int hist_length, bool equal) +{ + int lower = -1, + upper = hist_length - 1, + cmp, + middle; + + while (lower < upper) + { + middle = (lower + upper + 1) / 2; + cmp = range_cmp_bounds(typcache, &hist[middle], value); + + if (cmp < 0 || (equal && cmp == 0)) + lower = middle; + else + upper = middle - 1; + } + return lower; +} + +/* + * Get relative position of value in histogram bin in [0,1] range. + */ +static float8 +get_position(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist1, + RangeBound *hist2) +{ + bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid); + float8 position; + + if (!hist1->infinite && !hist2->infinite) + { + float8 bin_width; + + /* + * Both bounds are finite. Assuming the subtype's comparison function + * works sanely, the value must be finite, too, because it lies + * somewhere between the bounds. If it doesn't, just return something. + */ + if (value->infinite) + return 0.5; + + /* Can't interpolate without subdiff function */ + if (!has_subdiff) + return 0.5; + + /* Calculate relative position using subdiff function. */ + bin_width = DatumGetFloat8(FunctionCall2Coll( + &typcache->rng_subdiff_finfo, + typcache->rng_collation, + hist2->val, + hist1->val)); + if (bin_width <= 0.0) + return 0.5; /* zero width bin */ + + position = DatumGetFloat8(FunctionCall2Coll( + &typcache->rng_subdiff_finfo, + typcache->rng_collation, + value->val, + hist1->val)) + / bin_width; + + /* Relative position must be in [0,1] range */ + position = Max(position, 0.0); + position = Min(position, 1.0); + return position; + } + else if (hist1->infinite && !hist2->infinite) + { + /* + * Lower bin boundary is -infinite, upper is finite. If the value is + * -infinite, return 0.0 to indicate it's equal to the lower bound. + * Otherwise return 1.0 to indicate it's infinitely far from the lower + * bound. + */ + return ((value->infinite && value->lower) ? 0.0 : 1.0); + } + else if (!hist1->infinite && hist2->infinite) + { + /* same as above, but in reverse */ + return ((value->infinite && !value->lower) ? 1.0 : 0.0); + } + else + { + /* + * If both bin boundaries are infinite, they should be equal to each + * other, and the value should also be infinite and equal to both + * bounds. (But don't Assert that, to avoid crashing if a user creates + * a datatype with a broken comparison function). + * + * Assume the value to lie in the middle of the infinite bounds. + */ + return 0.5; + } +} + diff --git a/src/backend/utils/adt/rangetypes_typanalyze.c b/src/backend/utils/adt/rangetypes_typanalyze.c new file mode 100644 index 00000000000..e276ad8e181 --- /dev/null +++ b/src/backend/utils/adt/rangetypes_typanalyze.c @@ -0,0 +1,232 @@ +/*------------------------------------------------------------------------- + * + * ragetypes_typanalyze.c + * Functions for gathering statistics from range columns + * + * For a range type column, histograms of lower and upper bounds, and + * the fraction of NULL and empty ranges are collected. + * + * Both histograms have the same length, and they are combined into a + * single array of ranges. This has the same shape as the histogram that + * std_typanalyze would collect, but the values are different. Each range + * in the array is a valid range, even though the lower and upper bounds + * come from different tuples. In theory, the standard scalar selectivity + * functions could be used with the combined histogram. + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/utils/adt/rangetypes_typanalyze.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "catalog/pg_operator.h" +#include "commands/vacuum.h" +#include "utils/builtins.h" +#include "utils/rangetypes.h" + +static void compute_range_stats(VacAttrStats *stats, + AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows); + +/* + * range_typanalyze -- typanalyze function for range columns + */ +Datum +range_typanalyze(PG_FUNCTION_ARGS) +{ + VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0); + TypeCacheEntry *typcache; + Form_pg_attribute attr = stats->attr; + + /* Get information about range type */ + typcache = range_get_typcache(fcinfo, stats->attrtypid); + + if (attr->attstattarget < 0) + attr->attstattarget = default_statistics_target; + + stats->compute_stats = compute_range_stats; + stats->extra_data = typcache; + /* same as in std_typanalyze */ + stats->minrows = 300 * attr->attstattarget; + + PG_RETURN_BOOL(true); +} + +/* + * Comparison function for sorting RangeBounds. + */ +static int +range_bound_qsort_cmp(const void *a1, const void *a2, void *arg) +{ + RangeBound *b1 = (RangeBound *)a1; + RangeBound *b2 = (RangeBound *)a2; + TypeCacheEntry *typcache = (TypeCacheEntry *)arg; + + return range_cmp_bounds(typcache, b1, b2); +} + +/* + * compute_range_stats() -- compute statistics for a range column + */ +static void +compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, + int samplerows, double totalrows) +{ + TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data; + int null_cnt = 0; + int non_null_cnt = 0; + int non_empty_cnt = 0; + int empty_cnt = 0; + int range_no; + int slot_idx; + int num_bins = stats->attr->attstattarget; + int num_hist; + RangeBound *lowers, *uppers; + double total_width = 0; + + /* Allocate memory for arrays of range bounds. */ + lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows); + uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows); + + /* Loop over the sample ranges. */ + for (range_no = 0; range_no < samplerows; range_no++) + { + Datum value; + bool isnull, + empty; + RangeType *range; + RangeBound lower, + upper; + + vacuum_delay_point(); + + value = fetchfunc(stats, range_no, &isnull); + if (isnull) + { + /* range is null, just count that */ + null_cnt++; + continue; + } + + /* + * XXX: should we ignore wide values, like std_typanalyze does, to + * avoid bloating the statistics table? + */ + total_width += VARSIZE_ANY(DatumGetPointer(value)); + + /* Get range and deserialize it for further analysis. */ + range = DatumGetRangeType(value); + range_deserialize(typcache, range, &lower, &upper, &empty); + + if (!empty) + { + /* Fill bound values for further usage in histograms */ + lowers[non_empty_cnt] = lower; + uppers[non_empty_cnt] = upper; + non_empty_cnt++; + } + else + empty_cnt++; + + non_null_cnt++; + } + + slot_idx = 0; + + /* We can only compute real stats if we found some non-null values. */ + if (non_null_cnt > 0) + { + Datum *bound_hist_values; + int pos, + posfrac, + delta, + deltafrac, + i; + MemoryContext old_cxt; + float4 *emptyfrac; + + stats->stats_valid = true; + /* Do the simple null-frac and width stats */ + stats->stanullfrac = (double) null_cnt / (double) samplerows; + stats->stawidth = total_width / (double) non_null_cnt; + stats->stadistinct = -1.0; + + /* Must copy the target values into anl_context */ + old_cxt = MemoryContextSwitchTo(stats->anl_context); + + if (non_empty_cnt > 0) + { + /* Sort bound values */ + qsort_arg(lowers, non_empty_cnt, sizeof(RangeBound), + range_bound_qsort_cmp, typcache); + qsort_arg(uppers, non_empty_cnt, sizeof(RangeBound), + range_bound_qsort_cmp, typcache); + + num_hist = non_empty_cnt; + if (num_hist > num_bins) + num_hist = num_bins + 1; + + bound_hist_values = (Datum *) palloc(num_hist * sizeof(Datum)); + + /* + * The object of this loop is to construct ranges from first and + * last entries in lowers[] and uppers[] along with evenly-spaced + * values in between. So the i'th value is a range of + * lowers[(i * (nvals - 1)) / (num_hist - 1)] and + * uppers[(i * (nvals - 1)) / (num_hist - 1)]. But computing that + * subscript directly risks integer overflow when the stats target + * is more than a couple thousand. Instead we add + * (nvals - 1) / (num_hist - 1) to pos at each step, tracking the + * integral and fractional parts of the sum separately. + */ + delta = (non_empty_cnt - 1) / (num_hist - 1); + deltafrac = (non_empty_cnt - 1) % (num_hist - 1); + pos = posfrac = 0; + + for (i = 0; i < num_hist; i++) + { + bound_hist_values[i] = PointerGetDatum(range_serialize( + typcache, &lowers[pos], &uppers[pos], false)); + pos += delta; + posfrac += deltafrac; + if (posfrac >= (num_hist - 1)) + { + /* fractional part exceeds 1, carry to integer part */ + pos++; + posfrac -= (num_hist - 1); + } + } + + stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM; + stats->stavalues[slot_idx] = bound_hist_values; + stats->numvalues[slot_idx] = num_hist; + slot_idx++; + } + + /* Store the fraction of empty ranges */ + emptyfrac = (float4 *) palloc(sizeof(float4)); + *emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt); + stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_EMPTY_FRAC; + stats->stanumbers[slot_idx] = emptyfrac; + stats->numnumbers[slot_idx] = 1; + slot_idx++; + + MemoryContextSwitchTo(old_cxt); + } + else if (null_cnt > 0) + { + /* We found only nulls; assume the column is entirely null */ + stats->stats_valid = true; + stats->stanullfrac = 1.0; + stats->stawidth = 0; /* "unknown" */ + stats->stadistinct = 0.0; /* "unknown" */ + } + /* + * We don't need to bother cleaning up any of our temporary palloc's. The + * hashtable should also go away, as it used a child memory context. + */ +} diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index da84f625203..3fb0b5b227c 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201208161 +#define CATALOG_VERSION_NO 201208271 #endif diff --git a/src/include/catalog/pg_operator.h b/src/include/catalog/pg_operator.h index 94702541f55..abfec5cdb9d 100644 --- a/src/include/catalog/pg_operator.h +++ b/src/include/catalog/pg_operator.h @@ -1676,32 +1676,45 @@ DATA(insert OID = 3882 ( "=" PGNSP PGUID b t t 3831 3831 16 3882 3883 range_ DESCR("equal"); DATA(insert OID = 3883 ( "<>" PGNSP PGUID b f f 3831 3831 16 3883 3882 range_ne neqsel neqjoinsel )); DESCR("not equal"); -DATA(insert OID = 3884 ( "<" PGNSP PGUID b f f 3831 3831 16 3887 3886 range_lt scalarltsel scalarltjoinsel )); +DATA(insert OID = 3884 ( "<" PGNSP PGUID b f f 3831 3831 16 3887 3886 range_lt rangesel scalarltjoinsel )); DESCR("less than"); -DATA(insert OID = 3885 ( "<=" PGNSP PGUID b f f 3831 3831 16 3886 3887 range_le scalarltsel scalarltjoinsel )); +#define OID_RANGE_LESS_OP 3884 +DATA(insert OID = 3885 ( "<=" PGNSP PGUID b f f 3831 3831 16 3886 3887 range_le rangesel scalarltjoinsel )); DESCR("less than or equal"); -DATA(insert OID = 3886 ( ">=" PGNSP PGUID b f f 3831 3831 16 3885 3884 range_ge scalargtsel scalargtjoinsel )); +#define OID_RANGE_LESS_EQUAL_OP 3885 +DATA(insert OID = 3886 ( ">=" PGNSP PGUID b f f 3831 3831 16 3885 3884 range_ge rangesel scalargtjoinsel )); DESCR("greater than or equal"); -DATA(insert OID = 3887 ( ">" PGNSP PGUID b f f 3831 3831 16 3884 3885 range_gt scalargtsel scalargtjoinsel )); +#define OID_RANGE_GREATER_OP 3886 +DATA(insert OID = 3887 ( ">" PGNSP PGUID b f f 3831 3831 16 3884 3885 range_gt rangesel scalargtjoinsel )); DESCR("greater than"); -DATA(insert OID = 3888 ( "&&" PGNSP PGUID b f f 3831 3831 16 3888 0 range_overlaps areasel areajoinsel )); +#define OID_RANGE_GREATER_EQUAL_OP 3887 +DATA(insert OID = 3888 ( "&&" PGNSP PGUID b f f 3831 3831 16 3888 0 range_overlaps rangesel areajoinsel )); DESCR("overlaps"); -DATA(insert OID = 3889 ( "@>" PGNSP PGUID b f f 3831 2283 16 3891 0 range_contains_elem contsel contjoinsel )); +#define OID_RANGE_OVERLAP_OP 3888 +DATA(insert OID = 3889 ( "@>" PGNSP PGUID b f f 3831 2283 16 3891 0 range_contains_elem rangesel contjoinsel )); DESCR("contains"); -DATA(insert OID = 3890 ( "@>" PGNSP PGUID b f f 3831 3831 16 3892 0 range_contains contsel contjoinsel )); +#define OID_RANGE_CONTAINS_ELEM_OP 3889 +DATA(insert OID = 3890 ( "@>" PGNSP PGUID b f f 3831 3831 16 3892 0 range_contains rangesel contjoinsel )); DESCR("contains"); -DATA(insert OID = 3891 ( "<@" PGNSP PGUID b f f 2283 3831 16 3889 0 elem_contained_by_range contsel contjoinsel )); +#define OID_RANGE_CONTAINS_OP 3890 +DATA(insert OID = 3891 ( "<@" PGNSP PGUID b f f 2283 3831 16 3889 0 elem_contained_by_range rangesel contjoinsel )); DESCR("is contained by"); -DATA(insert OID = 3892 ( "<@" PGNSP PGUID b f f 3831 3831 16 3890 0 range_contained_by contsel contjoinsel )); +#define OID_RANGE_ELEM_CONTAINED_OP 3891 +DATA(insert OID = 3892 ( "<@" PGNSP PGUID b f f 3831 3831 16 3890 0 range_contained_by rangesel contjoinsel )); DESCR("is contained by"); -DATA(insert OID = 3893 ( "<<" PGNSP PGUID b f f 3831 3831 16 3894 0 range_before scalarltsel scalarltjoinsel )); +#define OID_RANGE_CONTAINED_OP 3892 +DATA(insert OID = 3893 ( "<<" PGNSP PGUID b f f 3831 3831 16 3894 0 range_before rangesel scalarltjoinsel )); DESCR("is left of"); -DATA(insert OID = 3894 ( ">>" PGNSP PGUID b f f 3831 3831 16 3893 0 range_after scalargtsel scalargtjoinsel )); +#define OID_RANGE_LEFT_OP 3893 +DATA(insert OID = 3894 ( ">>" PGNSP PGUID b f f 3831 3831 16 3893 0 range_after rangesel scalargtjoinsel )); DESCR("is right of"); -DATA(insert OID = 3895 ( "&<" PGNSP PGUID b f f 3831 3831 16 0 0 range_overleft scalarltsel scalarltjoinsel )); +#define OID_RANGE_RIGHT_OP 3894 +DATA(insert OID = 3895 ( "&<" PGNSP PGUID b f f 3831 3831 16 0 0 range_overleft rangesel scalarltjoinsel )); DESCR("overlaps or is left of"); -DATA(insert OID = 3896 ( "&>" PGNSP PGUID b f f 3831 3831 16 0 0 range_overright scalargtsel scalargtjoinsel )); +#define OID_RANGE_OVERLAPS_LEFT_OP 3895 +DATA(insert OID = 3896 ( "&>" PGNSP PGUID b f f 3831 3831 16 0 0 range_overright rangesel scalargtjoinsel )); DESCR("overlaps or is right of"); +#define OID_RANGE_OVERLAPS_RIGHT_OP 3896 DATA(insert OID = 3897 ( "-|-" PGNSP PGUID b f f 3831 3831 16 3897 0 range_adjacent contsel contjoinsel )); DESCR("is adjacent to"); DATA(insert OID = 3898 ( "+" PGNSP PGUID b f f 3831 3831 3831 3898 0 range_union - - )); diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 51449a4f1c5..77a3b413ce5 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -4544,6 +4544,8 @@ DATA(insert OID = 3902 ( hash_range PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 DESCR("hash a range"); DATA(insert OID = 3916 ( range_typanalyze PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 16 "2281" _null_ _null_ _null_ _null_ range_typanalyze _null_ _null_ _null_ )); DESCR("range typanalyze"); +DATA(insert OID = 3169 ( rangesel PGNSP PGUID 12 1 0 0 0 f f f f t f s 4 0 701 "2281 26 2281 23" _null_ _null_ _null_ _null_ rangesel _null_ _null_ _null_ )); +DESCR("restriction selectivity for range operators"); DATA(insert OID = 3914 ( int4range_canonical PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 3904 "3904" _null_ _null_ _null_ _null_ int4range_canonical _null_ _null_ _null_ )); DESCR("convert an int4 range to canonical form"); diff --git a/src/include/catalog/pg_statistic.h b/src/include/catalog/pg_statistic.h index 8724cb50fa1..5df685e4bcb 100644 --- a/src/include/catalog/pg_statistic.h +++ b/src/include/catalog/pg_statistic.h @@ -268,4 +268,22 @@ typedef FormData_pg_statistic *Form_pg_statistic; */ #define STATISTIC_KIND_DECHIST 5 +/* + * An "empty frac" slot describes the fraction of empty ranges in a range-type + * column. stavalues is not used and should be NULL. stanumbers contains a + * single entry, the fraction of empty ranges (0.0 to 1.0). + */ +#define STATISTIC_KIND_RANGE_EMPTY_FRAC 6 + +/* + * A "bounds histogram" slot is similar to STATISTIC_KIND_HISTOGRAM, but for + * a range-type column. stavalues contains M (>=2) range values that divide + * the column data values into M-1 bins of approximately equal population. + * Unlike a regular scalar histogram, this is actually two histograms combined + * into a single array, with the lower bounds of each value forming a + * histogram of lower bounds, and the upper bounds a histogram of upper + * bounds. Only non-NULL, non-empty ranges are included. + */ +#define STATISTIC_KIND_BOUNDS_HISTOGRAM 7 + #endif /* PG_STATISTIC_H */ diff --git a/src/include/utils/rangetypes.h b/src/include/utils/rangetypes.h index fb0b23b4be0..a7eeda05c67 100644 --- a/src/include/utils/rangetypes.h +++ b/src/include/utils/rangetypes.h @@ -170,6 +170,7 @@ extern Datum hash_range(PG_FUNCTION_ARGS); /* ANALYZE support */ extern Datum range_typanalyze(PG_FUNCTION_ARGS); +extern Datum rangesel(PG_FUNCTION_ARGS); /* Canonical functions */ extern Datum int4range_canonical(PG_FUNCTION_ARGS); |