aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/selfuncs.c
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2019-02-14 10:51:59 -0500
committerTom Lane <tgl@sss.pgh.pa.us>2019-02-14 10:51:59 -0500
commit49fa99e54ec0ded180b52a4a14e543702d53e8c9 (patch)
tree64af70fc403c2354c5b05106d2afb5bb465142ad /src/backend/utils/adt/selfuncs.c
parent109de05cbb034b032cd60f50708716c8ff0afdf2 (diff)
downloadpostgresql-49fa99e54ec0ded180b52a4a14e543702d53e8c9.tar.gz
postgresql-49fa99e54ec0ded180b52a4a14e543702d53e8c9.zip
Move pattern selectivity code from selfuncs.c to like_support.c.
While at it, refactor patternsel() a bit so that it can be used from the LIKE/regex planner support functions as well. This makes the planner able to deal equally well with either operator or function syntax for these operations. I'm not excited about that as a feature in itself, but it provides a nice model for extensions to follow if they want such behavior for their operations. This change localizes the use of pattern_fixed_prefix() and make_greater_string() so that they no longer need be exported. (We might get pushback from extensions about that, perhaps, in which case I'd be inclined to re-export them in a new header file like_support.h.) This reduces the bulk of selfuncs.c a fair amount, removing ~1370 lines or about one-sixth of that file; it's still too big, but this is progress. Discussion: https://postgr.es/m/24537.1550093915@sss.pgh.pa.us
Diffstat (limited to 'src/backend/utils/adt/selfuncs.c')
-rw-r--r--src/backend/utils/adt/selfuncs.c1333
1 files changed, 8 insertions, 1325 deletions
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index b9f99fa050b..c25357a0088 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -110,12 +110,9 @@
#include "catalog/pg_am.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_operator.h"
-#include "catalog/pg_opfamily.h"
#include "catalog/pg_statistic.h"
#include "catalog/pg_statistic_ext.h"
-#include "catalog/pg_type.h"
#include "executor/executor.h"
-#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
@@ -125,14 +122,10 @@
#include "optimizer/pathnode.h"
#include "optimizer/paths.h"
#include "optimizer/plancat.h"
-#include "optimizer/restrictinfo.h"
#include "parser/parse_clause.h"
-#include "parser/parse_coerce.h"
#include "parser/parsetree.h"
#include "statistics/statistics.h"
-#include "utils/acl.h"
#include "utils/builtins.h"
-#include "utils/bytea.h"
#include "utils/date.h"
#include "utils/datum.h"
#include "utils/fmgroids.h"
@@ -146,7 +139,6 @@
#include "utils/syscache.h"
#include "utils/timestamp.h"
#include "utils/typcache.h"
-#include "utils/varlena.h"
/* Hooks for plugins to get control when we ask for stats */
@@ -154,16 +146,6 @@ get_relation_stats_hook_type get_relation_stats_hook = NULL;
get_index_stats_hook_type get_index_stats_hook = NULL;
static double eqsel_internal(PG_FUNCTION_ARGS, bool negate);
-static double var_eq_const(VariableStatData *vardata, Oid operator,
- Datum constval, bool constisnull,
- bool varonleft, bool negate);
-static double var_eq_non_const(VariableStatData *vardata, Oid operator,
- Node *other,
- bool varonleft, bool negate);
-static double ineq_histogram_selectivity(PlannerInfo *root,
- VariableStatData *vardata,
- FmgrInfo *opproc, bool isgt, bool iseq,
- Datum constval, Oid consttype);
static double eqjoinsel_inner(Oid opfuncoid,
VariableStatData *vardata1, VariableStatData *vardata2,
double nd1, double nd2,
@@ -215,17 +197,6 @@ static bool get_actual_variable_range(PlannerInfo *root,
Oid sortop,
Datum *min, Datum *max);
static RelOptInfo *find_join_input_rel(PlannerInfo *root, Relids relids);
-static Selectivity prefix_selectivity(PlannerInfo *root,
- VariableStatData *vardata,
- Oid vartype, Oid opfamily, Const *prefixcon);
-static Selectivity like_selectivity(const char *patt, int pattlen,
- bool case_insensitive);
-static Selectivity regex_selectivity(const char *patt, int pattlen,
- bool case_insensitive,
- int fixed_prefix_len);
-static Datum string_to_datum(const char *str, Oid datatype);
-static Const *string_to_const(const char *str, Oid datatype);
-static Const *string_to_bytea_const(const char *str, size_t str_len);
static IndexQualInfo *deconstruct_indexqual(RestrictInfo *rinfo,
IndexOptInfo *index, int indexcol);
static List *add_predicate_to_quals(IndexOptInfo *index, List *indexQuals);
@@ -304,9 +275,9 @@ eqsel_internal(PG_FUNCTION_ARGS, bool negate)
/*
* var_eq_const --- eqsel for var = const case
*
- * This is split out so that some other estimation functions can use it.
+ * This is exported so that some other estimation functions can use it.
*/
-static double
+double
var_eq_const(VariableStatData *vardata, Oid operator,
Datum constval, bool constisnull,
bool varonleft, bool negate)
@@ -457,8 +428,10 @@ var_eq_const(VariableStatData *vardata, Oid operator,
/*
* var_eq_non_const --- eqsel for var = something-other-than-const case
+ *
+ * This is exported so that some other estimation functions can use it.
*/
-static double
+double
var_eq_non_const(VariableStatData *vardata, Oid operator,
Node *other,
bool varonleft, bool negate)
@@ -784,8 +757,10 @@ histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
* Note that the result disregards both the most-common-values (if any) and
* null entries. The caller is expected to combine this result with
* statistics for those portions of the column population.
+ *
+ * This is exported so that some other estimation functions can use it.
*/
-static double
+double
ineq_histogram_selectivity(PlannerInfo *root,
VariableStatData *vardata,
FmgrInfo *opproc, bool isgt, bool iseq,
@@ -1199,361 +1174,6 @@ scalargesel(PG_FUNCTION_ARGS)
}
/*
- * patternsel - Generic code for pattern-match selectivity.
- */
-static double
-patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
-{
- PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
- Oid operator = PG_GETARG_OID(1);
- List *args = (List *) PG_GETARG_POINTER(2);
- int varRelid = PG_GETARG_INT32(3);
- Oid collation = PG_GET_COLLATION();
- VariableStatData vardata;
- Node *other;
- bool varonleft;
- Datum constval;
- Oid consttype;
- Oid vartype;
- Oid opfamily;
- Pattern_Prefix_Status pstatus;
- Const *patt;
- Const *prefix = NULL;
- Selectivity rest_selec = 0;
- double nullfrac = 0.0;
- double result;
-
- /*
- * If this is for a NOT LIKE or similar operator, get the corresponding
- * positive-match operator and work with that. Set result to the correct
- * default estimate, too.
- */
- if (negate)
- {
- operator = get_negator(operator);
- if (!OidIsValid(operator))
- elog(ERROR, "patternsel called for operator without a negator");
- result = 1.0 - DEFAULT_MATCH_SEL;
- }
- else
- {
- result = DEFAULT_MATCH_SEL;
- }
-
- /*
- * If expression is not variable op constant, then punt and return a
- * default estimate.
- */
- if (!get_restriction_variable(root, args, varRelid,
- &vardata, &other, &varonleft))
- return result;
- if (!varonleft || !IsA(other, Const))
- {
- ReleaseVariableStats(vardata);
- return result;
- }
-
- /*
- * If the constant is NULL, assume operator is strict and return zero, ie,
- * operator will never return TRUE. (It's zero even for a negator op.)
- */
- if (((Const *) other)->constisnull)
- {
- ReleaseVariableStats(vardata);
- return 0.0;
- }
- constval = ((Const *) other)->constvalue;
- consttype = ((Const *) other)->consttype;
-
- /*
- * The right-hand const is type text or bytea for all supported operators.
- * We do not expect to see binary-compatible types here, since
- * const-folding should have relabeled the const to exactly match the
- * operator's declared type.
- */
- if (consttype != TEXTOID && consttype != BYTEAOID)
- {
- ReleaseVariableStats(vardata);
- return result;
- }
-
- /*
- * Similarly, the exposed type of the left-hand side should be one of
- * those we know. (Do not look at vardata.atttype, which might be
- * something binary-compatible but different.) We can use it to choose
- * the index opfamily from which we must draw the comparison operators.
- *
- * NOTE: It would be more correct to use the PATTERN opfamilies than the
- * simple ones, but at the moment ANALYZE will not generate statistics for
- * the PATTERN operators. But our results are so approximate anyway that
- * it probably hardly matters.
- */
- vartype = vardata.vartype;
-
- switch (vartype)
- {
- case TEXTOID:
- case NAMEOID:
- opfamily = TEXT_BTREE_FAM_OID;
- break;
- case BPCHAROID:
- opfamily = BPCHAR_BTREE_FAM_OID;
- break;
- case BYTEAOID:
- opfamily = BYTEA_BTREE_FAM_OID;
- break;
- default:
- ReleaseVariableStats(vardata);
- return result;
- }
-
- /*
- * Grab the nullfrac for use below.
- */
- if (HeapTupleIsValid(vardata.statsTuple))
- {
- Form_pg_statistic stats;
-
- stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
- nullfrac = stats->stanullfrac;
- }
-
- /*
- * Pull out any fixed prefix implied by the pattern, and estimate the
- * fractional selectivity of the remainder of the pattern. Unlike many of
- * the other functions in this file, we use the pattern operator's actual
- * collation for this step. This is not because we expect the collation
- * to make a big difference in the selectivity estimate (it seldom would),
- * but because we want to be sure we cache compiled regexps under the
- * right cache key, so that they can be re-used at runtime.
- */
- patt = (Const *) other;
- pstatus = pattern_fixed_prefix(patt, ptype, collation,
- &prefix, &rest_selec);
-
- /*
- * If necessary, coerce the prefix constant to the right type.
- */
- if (prefix && prefix->consttype != vartype)
- {
- char *prefixstr;
-
- switch (prefix->consttype)
- {
- case TEXTOID:
- prefixstr = TextDatumGetCString(prefix->constvalue);
- break;
- case BYTEAOID:
- prefixstr = DatumGetCString(DirectFunctionCall1(byteaout,
- prefix->constvalue));
- break;
- default:
- elog(ERROR, "unrecognized consttype: %u",
- prefix->consttype);
- ReleaseVariableStats(vardata);
- return result;
- }
- prefix = string_to_const(prefixstr, vartype);
- pfree(prefixstr);
- }
-
- if (pstatus == Pattern_Prefix_Exact)
- {
- /*
- * Pattern specifies an exact match, so pretend operator is '='
- */
- Oid eqopr = get_opfamily_member(opfamily, vartype, vartype,
- BTEqualStrategyNumber);
-
- if (eqopr == InvalidOid)
- elog(ERROR, "no = operator for opfamily %u", opfamily);
- result = var_eq_const(&vardata, eqopr, prefix->constvalue,
- false, true, false);
- }
- else
- {
- /*
- * Not exact-match pattern. If we have a sufficiently large
- * histogram, estimate selectivity for the histogram part of the
- * population by counting matches in the histogram. If not, estimate
- * selectivity of the fixed prefix and remainder of pattern
- * separately, then combine the two to get an estimate of the
- * selectivity for the part of the column population represented by
- * the histogram. (For small histograms, we combine these
- * approaches.)
- *
- * We then add up data for any most-common-values values; these are
- * not in the histogram population, and we can get exact answers for
- * them by applying the pattern operator, so there's no reason to
- * approximate. (If the MCVs cover a significant part of the total
- * population, this gives us a big leg up in accuracy.)
- */
- Selectivity selec;
- int hist_size;
- FmgrInfo opproc;
- double mcv_selec,
- sumcommon;
-
- /* Try to use the histogram entries to get selectivity */
- fmgr_info(get_opcode(operator), &opproc);
-
- selec = histogram_selectivity(&vardata, &opproc, constval, true,
- 10, 1, &hist_size);
-
- /* If not at least 100 entries, use the heuristic method */
- if (hist_size < 100)
- {
- Selectivity heursel;
- Selectivity prefixsel;
-
- if (pstatus == Pattern_Prefix_Partial)
- prefixsel = prefix_selectivity(root, &vardata, vartype,
- opfamily, prefix);
- else
- prefixsel = 1.0;
- heursel = prefixsel * rest_selec;
-
- if (selec < 0) /* fewer than 10 histogram entries? */
- selec = heursel;
- else
- {
- /*
- * For histogram sizes from 10 to 100, we combine the
- * histogram and heuristic selectivities, putting increasingly
- * more trust in the histogram for larger sizes.
- */
- double hist_weight = hist_size / 100.0;
-
- selec = selec * hist_weight + heursel * (1.0 - hist_weight);
- }
- }
-
- /* In any case, don't believe extremely small or large estimates. */
- if (selec < 0.0001)
- selec = 0.0001;
- else if (selec > 0.9999)
- selec = 0.9999;
-
- /*
- * If we have most-common-values info, add up the fractions of the MCV
- * entries that satisfy MCV OP PATTERN. These fractions contribute
- * directly to the result selectivity. Also add up the total fraction
- * represented by MCV entries.
- */
- mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true,
- &sumcommon);
-
- /*
- * Now merge the results from the MCV and histogram calculations,
- * realizing that the histogram covers only the non-null values that
- * are not listed in MCV.
- */
- selec *= 1.0 - nullfrac - sumcommon;
- selec += mcv_selec;
- result = selec;
- }
-
- /* now adjust if we wanted not-match rather than match */
- if (negate)
- result = 1.0 - result - nullfrac;
-
- /* result should be in range, but make sure... */
- CLAMP_PROBABILITY(result);
-
- if (prefix)
- {
- pfree(DatumGetPointer(prefix->constvalue));
- pfree(prefix);
- }
-
- ReleaseVariableStats(vardata);
-
- return result;
-}
-
-/*
- * regexeqsel - Selectivity of regular-expression pattern match.
- */
-Datum
-regexeqsel(PG_FUNCTION_ARGS)
-{
- PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, false));
-}
-
-/*
- * icregexeqsel - Selectivity of case-insensitive regex match.
- */
-Datum
-icregexeqsel(PG_FUNCTION_ARGS)
-{
- PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, false));
-}
-
-/*
- * likesel - Selectivity of LIKE pattern match.
- */
-Datum
-likesel(PG_FUNCTION_ARGS)
-{
- PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, false));
-}
-
-/*
- * prefixsel - selectivity of prefix operator
- */
-Datum
-prefixsel(PG_FUNCTION_ARGS)
-{
- PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Prefix, false));
-}
-
-/*
- *
- * iclikesel - Selectivity of ILIKE pattern match.
- */
-Datum
-iclikesel(PG_FUNCTION_ARGS)
-{
- PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, false));
-}
-
-/*
- * regexnesel - Selectivity of regular-expression pattern non-match.
- */
-Datum
-regexnesel(PG_FUNCTION_ARGS)
-{
- PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, true));
-}
-
-/*
- * icregexnesel - Selectivity of case-insensitive regex non-match.
- */
-Datum
-icregexnesel(PG_FUNCTION_ARGS)
-{
- PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, true));
-}
-
-/*
- * nlikesel - Selectivity of LIKE pattern non-match.
- */
-Datum
-nlikesel(PG_FUNCTION_ARGS)
-{
- PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, true));
-}
-
-/*
- * icnlikesel - Selectivity of ILIKE pattern non-match.
- */
-Datum
-icnlikesel(PG_FUNCTION_ARGS)
-{
- PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, true));
-}
-
-/*
* boolvarsel - Selectivity of Boolean variable.
*
* This can actually be called on any boolean-valued expression. If it
@@ -2896,96 +2516,6 @@ scalargejoinsel(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
}
-/*
- * patternjoinsel - Generic code for pattern-match join selectivity.
- */
-static double
-patternjoinsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
-{
- /* For the moment we just punt. */
- return negate ? (1.0 - DEFAULT_MATCH_SEL) : DEFAULT_MATCH_SEL;
-}
-
-/*
- * regexeqjoinsel - Join selectivity of regular-expression pattern match.
- */
-Datum
-regexeqjoinsel(PG_FUNCTION_ARGS)
-{
- PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, false));
-}
-
-/*
- * icregexeqjoinsel - Join selectivity of case-insensitive regex match.
- */
-Datum
-icregexeqjoinsel(PG_FUNCTION_ARGS)
-{
- PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, false));
-}
-
-/*
- * likejoinsel - Join selectivity of LIKE pattern match.
- */
-Datum
-likejoinsel(PG_FUNCTION_ARGS)
-{
- PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, false));
-}
-
-/*
- * prefixjoinsel - Join selectivity of prefix operator
- */
-Datum
-prefixjoinsel(PG_FUNCTION_ARGS)
-{
- PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Prefix, false));
-}
-
-/*
- * iclikejoinsel - Join selectivity of ILIKE pattern match.
- */
-Datum
-iclikejoinsel(PG_FUNCTION_ARGS)
-{
- PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, false));
-}
-
-/*
- * regexnejoinsel - Join selectivity of regex non-match.
- */
-Datum
-regexnejoinsel(PG_FUNCTION_ARGS)
-{
- PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, true));
-}
-
-/*
- * icregexnejoinsel - Join selectivity of case-insensitive regex non-match.
- */
-Datum
-icregexnejoinsel(PG_FUNCTION_ARGS)
-{
- PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, true));
-}
-
-/*
- * nlikejoinsel - Join selectivity of LIKE pattern non-match.
- */
-Datum
-nlikejoinsel(PG_FUNCTION_ARGS)
-{
- PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, true));
-}
-
-/*
- * icnlikejoinsel - Join selectivity of ILIKE pattern non-match.
- */
-Datum
-icnlikejoinsel(PG_FUNCTION_ARGS)
-{
- PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, true));
-}
/*
* mergejoinscansel - Scan selectivity of merge join.
@@ -5718,853 +5248,6 @@ find_join_input_rel(PlannerInfo *root, Relids relids)
/*-------------------------------------------------------------------------
*
- * Pattern analysis functions
- *
- * These routines support analysis of LIKE and regular-expression patterns
- * by the planner/optimizer. It's important that they agree with the
- * regular-expression code in backend/regex/ and the LIKE code in
- * backend/utils/adt/like.c. Also, the computation of the fixed prefix
- * must be conservative: if we report a string longer than the true fixed
- * prefix, the query may produce actually wrong answers, rather than just
- * getting a bad selectivity estimate!
- *
- * Note that the prefix-analysis functions are called from
- * backend/optimizer/path/indxpath.c as well as from routines in this file.
- *
- *-------------------------------------------------------------------------
- */
-
-/*
- * Check whether char is a letter (and, hence, subject to case-folding)
- *
- * In multibyte character sets or with ICU, we can't use isalpha, and it does not seem
- * worth trying to convert to wchar_t to use iswalpha. Instead, just assume
- * any multibyte char is potentially case-varying.
- */
-static int
-pattern_char_isalpha(char c, bool is_multibyte,
- pg_locale_t locale, bool locale_is_c)
-{
- if (locale_is_c)
- return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
- else if (is_multibyte && IS_HIGHBIT_SET(c))
- return true;
- else if (locale && locale->provider == COLLPROVIDER_ICU)
- return IS_HIGHBIT_SET(c) ? true : false;
-#ifdef HAVE_LOCALE_T
- else if (locale && locale->provider == COLLPROVIDER_LIBC)
- return isalpha_l((unsigned char) c, locale->info.lt);
-#endif
- else
- return isalpha((unsigned char) c);
-}
-
-/*
- * Extract the fixed prefix, if any, for a pattern.
- *
- * *prefix is set to a palloc'd prefix string (in the form of a Const node),
- * or to NULL if no fixed prefix exists for the pattern.
- * If rest_selec is not NULL, *rest_selec is set to an estimate of the
- * selectivity of the remainder of the pattern (without any fixed prefix).
- * The prefix Const has the same type (TEXT or BYTEA) as the input pattern.
- *
- * The return value distinguishes no fixed prefix, a partial prefix,
- * or an exact-match-only pattern.
- */
-
-static Pattern_Prefix_Status
-like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
- Const **prefix_const, Selectivity *rest_selec)
-{
- char *match;
- char *patt;
- int pattlen;
- Oid typeid = patt_const->consttype;
- int pos,
- match_pos;
- bool is_multibyte = (pg_database_encoding_max_length() > 1);
- pg_locale_t locale = 0;
- bool locale_is_c = false;
-
- /* the right-hand const is type text or bytea */
- Assert(typeid == BYTEAOID || typeid == TEXTOID);
-
- if (case_insensitive)
- {
- if (typeid == BYTEAOID)
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("case insensitive matching not supported on type bytea")));
-
- /* If case-insensitive, we need locale info */
- if (lc_ctype_is_c(collation))
- locale_is_c = true;
- else if (collation != DEFAULT_COLLATION_OID)
- {
- if (!OidIsValid(collation))
- {
- /*
- * This typically means that the parser could not resolve a
- * conflict of implicit collations, so report it that way.
- */
- ereport(ERROR,
- (errcode(ERRCODE_INDETERMINATE_COLLATION),
- errmsg("could not determine which collation to use for ILIKE"),
- errhint("Use the COLLATE clause to set the collation explicitly.")));
- }
- locale = pg_newlocale_from_collation(collation);
- }
- }
-
- if (typeid != BYTEAOID)
- {
- patt = TextDatumGetCString(patt_const->constvalue);
- pattlen = strlen(patt);
- }
- else
- {
- bytea *bstr = DatumGetByteaPP(patt_const->constvalue);
-
- pattlen = VARSIZE_ANY_EXHDR(bstr);
- patt = (char *) palloc(pattlen);
- memcpy(patt, VARDATA_ANY(bstr), pattlen);
- Assert((Pointer) bstr == DatumGetPointer(patt_const->constvalue));
- }
-
- match = palloc(pattlen + 1);
- match_pos = 0;
- for (pos = 0; pos < pattlen; pos++)
- {
- /* % and _ are wildcard characters in LIKE */
- if (patt[pos] == '%' ||
- patt[pos] == '_')
- break;
-
- /* Backslash escapes the next character */
- if (patt[pos] == '\\')
- {
- pos++;
- if (pos >= pattlen)
- break;
- }
-
- /* Stop if case-varying character (it's sort of a wildcard) */
- if (case_insensitive &&
- pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c))
- break;
-
- match[match_pos++] = patt[pos];
- }
-
- match[match_pos] = '\0';
-
- if (typeid != BYTEAOID)
- *prefix_const = string_to_const(match, typeid);
- else
- *prefix_const = string_to_bytea_const(match, match_pos);
-
- if (rest_selec != NULL)
- *rest_selec = like_selectivity(&patt[pos], pattlen - pos,
- case_insensitive);
-
- pfree(patt);
- pfree(match);
-
- /* in LIKE, an empty pattern is an exact match! */
- if (pos == pattlen)
- return Pattern_Prefix_Exact; /* reached end of pattern, so exact */
-
- if (match_pos > 0)
- return Pattern_Prefix_Partial;
-
- return Pattern_Prefix_None;
-}
-
-static Pattern_Prefix_Status
-regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
- Const **prefix_const, Selectivity *rest_selec)
-{
- Oid typeid = patt_const->consttype;
- char *prefix;
- bool exact;
-
- /*
- * Should be unnecessary, there are no bytea regex operators defined. As
- * such, it should be noted that the rest of this function has *not* been
- * made safe for binary (possibly NULL containing) strings.
- */
- if (typeid == BYTEAOID)
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("regular-expression matching not supported on type bytea")));
-
- /* Use the regexp machinery to extract the prefix, if any */
- prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue),
- case_insensitive, collation,
- &exact);
-
- if (prefix == NULL)
- {
- *prefix_const = NULL;
-
- if (rest_selec != NULL)
- {
- char *patt = TextDatumGetCString(patt_const->constvalue);
-
- *rest_selec = regex_selectivity(patt, strlen(patt),
- case_insensitive,
- 0);
- pfree(patt);
- }
-
- return Pattern_Prefix_None;
- }
-
- *prefix_const = string_to_const(prefix, typeid);
-
- if (rest_selec != NULL)
- {
- if (exact)
- {
- /* Exact match, so there's no additional selectivity */
- *rest_selec = 1.0;
- }
- else
- {
- char *patt = TextDatumGetCString(patt_const->constvalue);
-
- *rest_selec = regex_selectivity(patt, strlen(patt),
- case_insensitive,
- strlen(prefix));
- pfree(patt);
- }
- }
-
- pfree(prefix);
-
- if (exact)
- return Pattern_Prefix_Exact; /* pattern specifies exact match */
- else
- return Pattern_Prefix_Partial;
-}
-
-Pattern_Prefix_Status
-pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
- Const **prefix, Selectivity *rest_selec)
-{
- Pattern_Prefix_Status result;
-
- switch (ptype)
- {
- case Pattern_Type_Like:
- result = like_fixed_prefix(patt, false, collation,
- prefix, rest_selec);
- break;
- case Pattern_Type_Like_IC:
- result = like_fixed_prefix(patt, true, collation,
- prefix, rest_selec);
- break;
- case Pattern_Type_Regex:
- result = regex_fixed_prefix(patt, false, collation,
- prefix, rest_selec);
- break;
- case Pattern_Type_Regex_IC:
- result = regex_fixed_prefix(patt, true, collation,
- prefix, rest_selec);
- break;
- case Pattern_Type_Prefix:
- /* Prefix type work is trivial. */
- result = Pattern_Prefix_Partial;
- *rest_selec = 1.0; /* all */
- *prefix = makeConst(patt->consttype,
- patt->consttypmod,
- patt->constcollid,
- patt->constlen,
- datumCopy(patt->constvalue,
- patt->constbyval,
- patt->constlen),
- patt->constisnull,
- patt->constbyval);
- break;
- default:
- elog(ERROR, "unrecognized ptype: %d", (int) ptype);
- result = Pattern_Prefix_None; /* keep compiler quiet */
- break;
- }
- return result;
-}
-
-/*
- * Estimate the selectivity of a fixed prefix for a pattern match.
- *
- * A fixed prefix "foo" is estimated as the selectivity of the expression
- * "variable >= 'foo' AND variable < 'fop'" (see also indxpath.c).
- *
- * The selectivity estimate is with respect to the portion of the column
- * population represented by the histogram --- the caller must fold this
- * together with info about MCVs and NULLs.
- *
- * We use the >= and < operators from the specified btree opfamily to do the
- * estimation. The given variable and Const must be of the associated
- * datatype.
- *
- * XXX Note: we make use of the upper bound to estimate operator selectivity
- * even if the locale is such that we cannot rely on the upper-bound string.
- * The selectivity only needs to be approximately right anyway, so it seems
- * more useful to use the upper-bound code than not.
- */
-static Selectivity
-prefix_selectivity(PlannerInfo *root, VariableStatData *vardata,
- Oid vartype, Oid opfamily, Const *prefixcon)
-{
- Selectivity prefixsel;
- Oid cmpopr;
- FmgrInfo opproc;
- AttStatsSlot sslot;
- Const *greaterstrcon;
- Selectivity eq_sel;
-
- cmpopr = get_opfamily_member(opfamily, vartype, vartype,
- BTGreaterEqualStrategyNumber);
- if (cmpopr == InvalidOid)
- elog(ERROR, "no >= operator for opfamily %u", opfamily);
- fmgr_info(get_opcode(cmpopr), &opproc);
-
- prefixsel = ineq_histogram_selectivity(root, vardata,
- &opproc, true, true,
- prefixcon->constvalue,
- prefixcon->consttype);
-
- if (prefixsel < 0.0)
- {
- /* No histogram is present ... return a suitable default estimate */
- return DEFAULT_MATCH_SEL;
- }
-
- /*-------
- * If we can create a string larger than the prefix, say
- * "x < greaterstr". We try to generate the string referencing the
- * collation of the var's statistics, but if that's not available,
- * use DEFAULT_COLLATION_OID.
- *-------
- */
- if (HeapTupleIsValid(vardata->statsTuple) &&
- get_attstatsslot(&sslot, vardata->statsTuple,
- STATISTIC_KIND_HISTOGRAM, InvalidOid, 0))
- /* sslot.stacoll is set up */ ;
- else
- sslot.stacoll = DEFAULT_COLLATION_OID;
- cmpopr = get_opfamily_member(opfamily, vartype, vartype,
- BTLessStrategyNumber);
- if (cmpopr == InvalidOid)
- elog(ERROR, "no < operator for opfamily %u", opfamily);
- fmgr_info(get_opcode(cmpopr), &opproc);
- greaterstrcon = make_greater_string(prefixcon, &opproc, sslot.stacoll);
- if (greaterstrcon)
- {
- Selectivity topsel;
-
- topsel = ineq_histogram_selectivity(root, vardata,
- &opproc, false, false,
- greaterstrcon->constvalue,
- greaterstrcon->consttype);
-
- /* ineq_histogram_selectivity worked before, it shouldn't fail now */
- Assert(topsel >= 0.0);
-
- /*
- * Merge the two selectivities in the same way as for a range query
- * (see clauselist_selectivity()). Note that we don't need to worry
- * about double-exclusion of nulls, since ineq_histogram_selectivity
- * doesn't count those anyway.
- */
- prefixsel = topsel + prefixsel - 1.0;
- }
-
- /*
- * If the prefix is long then the two bounding values might be too close
- * together for the histogram to distinguish them usefully, resulting in a
- * zero estimate (plus or minus roundoff error). To avoid returning a
- * ridiculously small estimate, compute the estimated selectivity for
- * "variable = 'foo'", and clamp to that. (Obviously, the resultant
- * estimate should be at least that.)
- *
- * We apply this even if we couldn't make a greater string. That case
- * suggests that the prefix is near the maximum possible, and thus
- * probably off the end of the histogram, and thus we probably got a very
- * small estimate from the >= condition; so we still need to clamp.
- */
- cmpopr = get_opfamily_member(opfamily, vartype, vartype,
- BTEqualStrategyNumber);
- if (cmpopr == InvalidOid)
- elog(ERROR, "no = operator for opfamily %u", opfamily);
- eq_sel = var_eq_const(vardata, cmpopr, prefixcon->constvalue,
- false, true, false);
-
- prefixsel = Max(prefixsel, eq_sel);
-
- return prefixsel;
-}
-
-
-/*
- * Estimate the selectivity of a pattern of the specified type.
- * Note that any fixed prefix of the pattern will have been removed already,
- * so actually we may be looking at just a fragment of the pattern.
- *
- * For now, we use a very simplistic approach: fixed characters reduce the
- * selectivity a good deal, character ranges reduce it a little,
- * wildcards (such as % for LIKE or .* for regex) increase it.
- */
-
-#define FIXED_CHAR_SEL 0.20 /* about 1/5 */
-#define CHAR_RANGE_SEL 0.25
-#define ANY_CHAR_SEL 0.9 /* not 1, since it won't match end-of-string */
-#define FULL_WILDCARD_SEL 5.0
-#define PARTIAL_WILDCARD_SEL 2.0
-
-static Selectivity
-like_selectivity(const char *patt, int pattlen, bool case_insensitive)
-{
- Selectivity sel = 1.0;
- int pos;
-
- /* Skip any leading wildcard; it's already factored into initial sel */
- for (pos = 0; pos < pattlen; pos++)
- {
- if (patt[pos] != '%' && patt[pos] != '_')
- break;
- }
-
- for (; pos < pattlen; pos++)
- {
- /* % and _ are wildcard characters in LIKE */
- if (patt[pos] == '%')
- sel *= FULL_WILDCARD_SEL;
- else if (patt[pos] == '_')
- sel *= ANY_CHAR_SEL;
- else if (patt[pos] == '\\')
- {
- /* Backslash quotes the next character */
- pos++;
- if (pos >= pattlen)
- break;
- sel *= FIXED_CHAR_SEL;
- }
- else
- sel *= FIXED_CHAR_SEL;
- }
- /* Could get sel > 1 if multiple wildcards */
- if (sel > 1.0)
- sel = 1.0;
- return sel;
-}
-
-static Selectivity
-regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive)
-{
- Selectivity sel = 1.0;
- int paren_depth = 0;
- int paren_pos = 0; /* dummy init to keep compiler quiet */
- int pos;
-
- for (pos = 0; pos < pattlen; pos++)
- {
- if (patt[pos] == '(')
- {
- if (paren_depth == 0)
- paren_pos = pos; /* remember start of parenthesized item */
- paren_depth++;
- }
- else if (patt[pos] == ')' && paren_depth > 0)
- {
- paren_depth--;
- if (paren_depth == 0)
- sel *= regex_selectivity_sub(patt + (paren_pos + 1),
- pos - (paren_pos + 1),
- case_insensitive);
- }
- else if (patt[pos] == '|' && paren_depth == 0)
- {
- /*
- * If unquoted | is present at paren level 0 in pattern, we have
- * multiple alternatives; sum their probabilities.
- */
- sel += regex_selectivity_sub(patt + (pos + 1),
- pattlen - (pos + 1),
- case_insensitive);
- break; /* rest of pattern is now processed */
- }
- else if (patt[pos] == '[')
- {
- bool negclass = false;
-
- if (patt[++pos] == '^')
- {
- negclass = true;
- pos++;
- }
- if (patt[pos] == ']') /* ']' at start of class is not special */
- pos++;
- while (pos < pattlen && patt[pos] != ']')
- pos++;
- if (paren_depth == 0)
- sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
- }
- else if (patt[pos] == '.')
- {
- if (paren_depth == 0)
- sel *= ANY_CHAR_SEL;
- }
- else if (patt[pos] == '*' ||
- patt[pos] == '?' ||
- patt[pos] == '+')
- {
- /* Ought to be smarter about quantifiers... */
- if (paren_depth == 0)
- sel *= PARTIAL_WILDCARD_SEL;
- }
- else if (patt[pos] == '{')
- {
- while (pos < pattlen && patt[pos] != '}')
- pos++;
- if (paren_depth == 0)
- sel *= PARTIAL_WILDCARD_SEL;
- }
- else if (patt[pos] == '\\')
- {
- /* backslash quotes the next character */
- pos++;
- if (pos >= pattlen)
- break;
- if (paren_depth == 0)
- sel *= FIXED_CHAR_SEL;
- }
- else
- {
- if (paren_depth == 0)
- sel *= FIXED_CHAR_SEL;
- }
- }
- /* Could get sel > 1 if multiple wildcards */
- if (sel > 1.0)
- sel = 1.0;
- return sel;
-}
-
-static Selectivity
-regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
- int fixed_prefix_len)
-{
- Selectivity sel;
-
- /* If patt doesn't end with $, consider it to have a trailing wildcard */
- if (pattlen > 0 && patt[pattlen - 1] == '$' &&
- (pattlen == 1 || patt[pattlen - 2] != '\\'))
- {
- /* has trailing $ */
- sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
- }
- else
- {
- /* no trailing $ */
- sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
- sel *= FULL_WILDCARD_SEL;
- }
-
- /* If there's a fixed prefix, discount its selectivity */
- if (fixed_prefix_len > 0)
- sel /= pow(FIXED_CHAR_SEL, fixed_prefix_len);
-
- /* Make sure result stays in range */
- CLAMP_PROBABILITY(sel);
- return sel;
-}
-
-
-/*
- * For bytea, the increment function need only increment the current byte
- * (there are no multibyte characters to worry about).
- */
-static bool
-byte_increment(unsigned char *ptr, int len)
-{
- if (*ptr >= 255)
- return false;
- (*ptr)++;
- return true;
-}
-
-/*
- * Try to generate a string greater than the given string or any
- * string it is a prefix of. If successful, return a palloc'd string
- * in the form of a Const node; else return NULL.
- *
- * The caller must provide the appropriate "less than" comparison function
- * for testing the strings, along with the collation to use.
- *
- * The key requirement here is that given a prefix string, say "foo",
- * we must be able to generate another string "fop" that is greater than
- * all strings "foobar" starting with "foo". We can test that we have
- * generated a string greater than the prefix string, but in non-C collations
- * that is not a bulletproof guarantee that an extension of the string might
- * not sort after it; an example is that "foo " is less than "foo!", but it
- * is not clear that a "dictionary" sort ordering will consider "foo!" less
- * than "foo bar". CAUTION: Therefore, this function should be used only for
- * estimation purposes when working in a non-C collation.
- *
- * To try to catch most cases where an extended string might otherwise sort
- * before the result value, we determine which of the strings "Z", "z", "y",
- * and "9" is seen as largest by the collation, and append that to the given
- * prefix before trying to find a string that compares as larger.
- *
- * To search for a greater string, we repeatedly "increment" the rightmost
- * character, using an encoding-specific character incrementer function.
- * When it's no longer possible to increment the last character, we truncate
- * off that character and start incrementing the next-to-rightmost.
- * For example, if "z" were the last character in the sort order, then we
- * could produce "foo" as a string greater than "fonz".
- *
- * This could be rather slow in the worst case, but in most cases we
- * won't have to try more than one or two strings before succeeding.
- *
- * Note that it's important for the character incrementer not to be too anal
- * about producing every possible character code, since in some cases the only
- * way to get a larger string is to increment a previous character position.
- * So we don't want to spend too much time trying every possible character
- * code at the last position. A good rule of thumb is to be sure that we
- * don't try more than 256*K values for a K-byte character (and definitely
- * not 256^K, which is what an exhaustive search would approach).
- */
-Const *
-make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
-{
- Oid datatype = str_const->consttype;
- char *workstr;
- int len;
- Datum cmpstr;
- char *cmptxt = NULL;
- mbcharacter_incrementer charinc;
-
- /*
- * Get a modifiable copy of the prefix string in C-string format, and set
- * up the string we will compare to as a Datum. In C locale this can just
- * be the given prefix string, otherwise we need to add a suffix. Type
- * BYTEA sorts bytewise so it never needs a suffix either.
- */
- if (datatype == BYTEAOID)
- {
- bytea *bstr = DatumGetByteaPP(str_const->constvalue);
-
- len = VARSIZE_ANY_EXHDR(bstr);
- workstr = (char *) palloc(len);
- memcpy(workstr, VARDATA_ANY(bstr), len);
- Assert((Pointer) bstr == DatumGetPointer(str_const->constvalue));
- cmpstr = str_const->constvalue;
- }
- else
- {
- if (datatype == NAMEOID)
- workstr = DatumGetCString(DirectFunctionCall1(nameout,
- str_const->constvalue));
- else
- workstr = TextDatumGetCString(str_const->constvalue);
- len = strlen(workstr);
- if (lc_collate_is_c(collation) || len == 0)
- cmpstr = str_const->constvalue;
- else
- {
- /* If first time through, determine the suffix to use */
- static char suffixchar = 0;
- static Oid suffixcollation = 0;
-
- if (!suffixchar || suffixcollation != collation)
- {
- char *best;
-
- best = "Z";
- if (varstr_cmp(best, 1, "z", 1, collation) < 0)
- best = "z";
- if (varstr_cmp(best, 1, "y", 1, collation) < 0)
- best = "y";
- if (varstr_cmp(best, 1, "9", 1, collation) < 0)
- best = "9";
- suffixchar = *best;
- suffixcollation = collation;
- }
-
- /* And build the string to compare to */
- if (datatype == NAMEOID)
- {
- cmptxt = palloc(len + 2);
- memcpy(cmptxt, workstr, len);
- cmptxt[len] = suffixchar;
- cmptxt[len + 1] = '\0';
- cmpstr = PointerGetDatum(cmptxt);
- }
- else
- {
- cmptxt = palloc(VARHDRSZ + len + 1);
- SET_VARSIZE(cmptxt, VARHDRSZ + len + 1);
- memcpy(VARDATA(cmptxt), workstr, len);
- *(VARDATA(cmptxt) + len) = suffixchar;
- cmpstr = PointerGetDatum(cmptxt);
- }
- }
- }
-
- /* Select appropriate character-incrementer function */
- if (datatype == BYTEAOID)
- charinc = byte_increment;
- else
- charinc = pg_database_encoding_character_incrementer();
-
- /* And search ... */
- while (len > 0)
- {
- int charlen;
- unsigned char *lastchar;
-
- /* Identify the last character --- for bytea, just the last byte */
- if (datatype == BYTEAOID)
- charlen = 1;
- else
- charlen = len - pg_mbcliplen(workstr, len, len - 1);
- lastchar = (unsigned char *) (workstr + len - charlen);
-
- /*
- * Try to generate a larger string by incrementing the last character
- * (for BYTEA, we treat each byte as a character).
- *
- * Note: the incrementer function is expected to return true if it's
- * generated a valid-per-the-encoding new character, otherwise false.
- * The contents of the character on false return are unspecified.
- */
- while (charinc(lastchar, charlen))
- {
- Const *workstr_const;
-
- if (datatype == BYTEAOID)
- workstr_const = string_to_bytea_const(workstr, len);
- else
- workstr_const = string_to_const(workstr, datatype);
-
- if (DatumGetBool(FunctionCall2Coll(ltproc,
- collation,
- cmpstr,
- workstr_const->constvalue)))
- {
- /* Successfully made a string larger than cmpstr */
- if (cmptxt)
- pfree(cmptxt);
- pfree(workstr);
- return workstr_const;
- }
-
- /* No good, release unusable value and try again */
- pfree(DatumGetPointer(workstr_const->constvalue));
- pfree(workstr_const);
- }
-
- /*
- * No luck here, so truncate off the last character and try to
- * increment the next one.
- */
- len -= charlen;
- workstr[len] = '\0';
- }
-
- /* Failed... */
- if (cmptxt)
- pfree(cmptxt);
- pfree(workstr);
-
- return NULL;
-}
-
-/*
- * Generate a Datum of the appropriate type from a C string.
- * Note that all of the supported types are pass-by-ref, so the
- * returned value should be pfree'd if no longer needed.
- */
-static Datum
-string_to_datum(const char *str, Oid datatype)
-{
- Assert(str != NULL);
-
- /*
- * We cheat a little by assuming that CStringGetTextDatum() will do for
- * bpchar and varchar constants too...
- */
- if (datatype == NAMEOID)
- return DirectFunctionCall1(namein, CStringGetDatum(str));
- else if (datatype == BYTEAOID)
- return DirectFunctionCall1(byteain, CStringGetDatum(str));
- else
- return CStringGetTextDatum(str);
-}
-
-/*
- * Generate a Const node of the appropriate type from a C string.
- */
-static Const *
-string_to_const(const char *str, Oid datatype)
-{
- Datum conval = string_to_datum(str, datatype);
- Oid collation;
- int constlen;
-
- /*
- * We only need to support a few datatypes here, so hard-wire properties
- * instead of incurring the expense of catalog lookups.
- */
- switch (datatype)
- {
- case TEXTOID:
- case VARCHAROID:
- case BPCHAROID:
- collation = DEFAULT_COLLATION_OID;
- constlen = -1;
- break;
-
- case NAMEOID:
- collation = C_COLLATION_OID;
- constlen = NAMEDATALEN;
- break;
-
- case BYTEAOID:
- collation = InvalidOid;
- constlen = -1;
- break;
-
- default:
- elog(ERROR, "unexpected datatype in string_to_const: %u",
- datatype);
- return NULL;
- }
-
- return makeConst(datatype, -1, collation, constlen,
- conval, false, false);
-}
-
-/*
- * Generate a Const node of bytea type from a binary C string and a length.
- */
-static Const *
-string_to_bytea_const(const char *str, size_t str_len)
-{
- bytea *bstr = palloc(VARHDRSZ + str_len);
- Datum conval;
-
- memcpy(VARDATA(bstr), str, str_len);
- SET_VARSIZE(bstr, VARHDRSZ + str_len);
- conval = PointerGetDatum(bstr);
-
- return makeConst(BYTEAOID, -1, InvalidOid, -1, conval, false, false);
-}
-
-/*-------------------------------------------------------------------------
- *
* Index cost estimation functions
*
*-------------------------------------------------------------------------