diff options
Diffstat (limited to 'src/backend')
-rw-r--r-- | src/backend/optimizer/path/clausesel.c | 245 | ||||
-rw-r--r-- | src/backend/utils/adt/selfuncs.c | 174 |
2 files changed, 345 insertions, 74 deletions
diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c index d3a494f9bc9..edce3d21291 100644 --- a/src/backend/optimizer/path/clausesel.c +++ b/src/backend/optimizer/path/clausesel.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/optimizer/path/clausesel.c,v 1.28 2000/01/23 02:06:58 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/optimizer/path/clausesel.c,v 1.29 2000/01/24 07:16:46 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -23,6 +23,23 @@ #include "utils/lsyscache.h" +/* + * Data structure for accumulating info about possible range-query + * clause pairs in clauselist_selectivity. + */ +typedef struct RangeQueryClause { + struct RangeQueryClause *next; /* next in linked list */ + Node *var; /* The common variable of the clauses */ + bool have_lobound; /* found a low-bound clause yet? */ + bool have_hibound; /* found a high-bound clause yet? */ + Selectivity lobound; /* Selectivity of a var > something clause */ + Selectivity hibound; /* Selectivity of a var < something clause */ +} RangeQueryClause; + +static void addRangeClause(RangeQueryClause **rqlist, Node *clause, + int flag, bool isLTsel, Selectivity s2); + + /**************************************************************************** * ROUTINES TO COMPUTE SELECTIVITIES ****************************************************************************/ @@ -55,30 +72,238 @@ restrictlist_selectivity(Query *root, * must be returned. * * See clause_selectivity() for the meaning of the varRelid parameter. + * + * Our basic approach is to take the product of the selectivities of the + * subclauses. However, that's only right if the subclauses have independent + * probabilities, and in reality they are often NOT independent. So, + * we want to be smarter where we can. + + * Currently, the only extra smarts we have is to recognize "range queries", + * such as "x > 34 AND x < 42". Clauses are recognized as possible range + * query components if they are restriction opclauses whose operators have + * scalarltsel() or scalargtsel() as their restriction selectivity estimator. + * We pair up clauses of this form that refer to the same variable. An + * unpairable clause of this kind is simply multiplied into the selectivity + * product in the normal way. But when we find a pair, we know that the + * selectivities represent the relative positions of the low and high bounds + * within the column's range, so instead of figuring the selectivity as + * hisel * losel, we can figure it as hisel + losel - 1. (To visualize this, + * see that hisel is the fraction of the range below the high bound, while + * losel is the fraction above the low bound; so hisel can be interpreted + * directly as a 0..1 value but we need to convert losel to 1-losel before + * interpreting it as a value. Then the available range is 1-losel to hisel.) + * If the calculation yields zero or negative, however, we chicken out and + * use the default interpretation; that probably means that one or both + * selectivities is a default estimate rather than an actual range value. + * Of course this is all very dependent on the behavior of + * scalarltsel/scalargtsel; perhaps some day we can generalize the approach. */ Selectivity clauselist_selectivity(Query *root, List *clauses, int varRelid) { - Selectivity s1 = 1.0; - List *clause; + Selectivity s1 = 1.0; + RangeQueryClause *rqlist = NULL; + List *clist; - /* Use the product of the selectivities of the subclauses. - * XXX this is too optimistic, since the subclauses - * are very likely not independent... + /* + * Initial scan over clauses. Anything that doesn't look like a + * potential rangequery clause gets multiplied into s1 and forgotten. + * Anything that does gets inserted into an rqlist entry. */ - foreach(clause, clauses) + foreach(clist, clauses) { - Selectivity s2 = clause_selectivity(root, - (Node *) lfirst(clause), - varRelid); + Node *clause = (Node *) lfirst(clist); + Selectivity s2; + + /* + * See if it looks like a restriction clause with a constant. + * (If it's not a constant we can't really trust the selectivity!) + * NB: for consistency of results, this fragment of code had + * better match what clause_selectivity() would do. + */ + if (varRelid != 0 || NumRelids(clause) == 1) + { + int relidx; + AttrNumber attno; + Datum constval; + int flag; + + get_relattval(clause, varRelid, + &relidx, &attno, &constval, &flag); + if (relidx != 0 && (flag & SEL_CONSTANT)) + { + /* if get_relattval succeeded, it must be an opclause */ + Oid opno = ((Oper *) ((Expr *) clause)->oper)->opno; + RegProcedure oprrest = get_oprrest(opno); + + if (!oprrest) + s2 = (Selectivity) 0.5; + else + s2 = restriction_selectivity(oprrest, opno, + getrelid(relidx, + root->rtable), + attno, + constval, flag); + /* + * If we reach here, we have computed the same result + * that clause_selectivity would, so we can just use s2 + * if it's the wrong oprrest. But if it's the right + * oprrest, add the clause to rqlist for later processing. + */ + switch (oprrest) + { + case F_SCALARLTSEL: + addRangeClause(&rqlist, clause, flag, true, s2); + break; + case F_SCALARGTSEL: + addRangeClause(&rqlist, clause, flag, false, s2); + break; + default: + /* Just merge the selectivity in generically */ + s1 = s1 * s2; + break; + } + continue; /* drop to loop bottom */ + } + } + /* Not the right form, so treat it generically. */ + s2 = clause_selectivity(root, clause, varRelid); s1 = s1 * s2; } + + /* + * Now scan the rangequery pair list. + */ + while (rqlist != NULL) + { + RangeQueryClause *rqnext; + + if (rqlist->have_lobound && rqlist->have_hibound) + { + /* Successfully matched a pair of range clauses */ + Selectivity s2 = rqlist->hibound + rqlist->lobound - 1.0; + + if (s2 > 0.0) + { + /* All our hard work has paid off! */ + s1 *= s2; + } + else + { + /* One or both is probably a default estimate, + * so punt and just merge them in generically. + */ + s1 *= rqlist->hibound * rqlist->lobound; + } + } + else + { + /* Only found one of a pair, merge it in generically */ + if (rqlist->have_lobound) + s1 *= rqlist->lobound; + else + s1 *= rqlist->hibound; + } + /* release storage and advance */ + rqnext = rqlist->next; + pfree(rqlist); + rqlist = rqnext; + } + return s1; } /* + * addRangeClause --- add a new range clause for clauselist_selectivity + * + * Here is where we try to match up pairs of range-query clauses + */ +static void +addRangeClause(RangeQueryClause **rqlist, Node *clause, + int flag, bool isLTsel, Selectivity s2) +{ + RangeQueryClause *rqelem; + Node *var; + bool is_lobound; + + /* get_relattval sets flag&SEL_RIGHT if the var is on the LEFT. */ + if (flag & SEL_RIGHT) + { + var = (Node *) get_leftop((Expr *) clause); + is_lobound = ! isLTsel; /* x < something is high bound */ + } + else + { + var = (Node *) get_rightop((Expr *) clause); + is_lobound = isLTsel; /* something < x is low bound */ + } + + for (rqelem = *rqlist; rqelem; rqelem = rqelem->next) + { + /* We use full equal() here because the "var" might be a function + * of one or more attributes of the same relation... + */ + if (! equal(var, rqelem->var)) + continue; + /* Found the right group to put this clause in */ + if (is_lobound) + { + if (! rqelem->have_lobound) + { + rqelem->have_lobound = true; + rqelem->lobound = s2; + } + else + { + /* We have found two similar clauses, such as + * x < y AND x < z. Keep only the more restrictive one. + */ + if (rqelem->lobound > s2) + rqelem->lobound = s2; + } + } + else + { + if (! rqelem->have_hibound) + { + rqelem->have_hibound = true; + rqelem->hibound = s2; + } + else + { + /* We have found two similar clauses, such as + * x > y AND x > z. Keep only the more restrictive one. + */ + if (rqelem->hibound > s2) + rqelem->hibound = s2; + } + } + return; + } + + /* No matching var found, so make a new clause-pair data structure */ + rqelem = (RangeQueryClause *) palloc(sizeof(RangeQueryClause)); + rqelem->var = var; + if (is_lobound) + { + rqelem->have_lobound = true; + rqelem->have_hibound = false; + rqelem->lobound = s2; + } + else + { + rqelem->have_lobound = false; + rqelem->have_hibound = true; + rqelem->hibound = s2; + } + rqelem->next = *rqlist; + *rqlist = rqelem; +} + + +/* * clause_selectivity - * Compute the selectivity of a general boolean expression clause. * diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index c6b9520a263..346bda3bc7d 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -14,7 +14,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.52 2000/01/24 02:12:55 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.53 2000/01/24 07:16:46 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -47,15 +47,13 @@ /* default selectivity estimate for inequalities such as "A < b" */ #define DEFAULT_INEQ_SEL (1.0 / 3.0) -static bool convert_to_scale(Datum value, Oid typid, - double *scaleval); static void getattproperties(Oid relid, AttrNumber attnum, Oid *typid, int *typlen, bool *typbyval, int32 *typmod); static bool getattstatistics(Oid relid, AttrNumber attnum, - Oid opid, Oid typid, int32 typmod, + Oid typid, int32 typmod, double *nullfrac, double *commonfrac, Datum *commonval, @@ -100,7 +98,7 @@ eqsel(Oid opid, &typid, &typlen, &typbyval, &typmod); /* get stats for the attribute, if available */ - if (getattstatistics(relid, attno, opid, typid, typmod, + if (getattstatistics(relid, attno, typid, typmod, &nullfrac, &commonfrac, &commonval, NULL, NULL)) { @@ -212,19 +210,18 @@ neqsel(Oid opid, } /* - * intltsel - Selectivity of "<" (also "<=") for integers. + * scalarltsel - Selectivity of "<" (also "<=") for scalars. * - * Actually, this works and is used for all numeric types, so it should - * be renamed. In fact, it is also currently called for all manner of - * non-numeric types, for which it is NOT very helpful. That needs - * to be fixed. + * This routine works for any datatype (or pair of datatypes) known to + * convert_to_scalar(). If it is applied to some other datatype, + * it will return a default estimate. */ float64 -intltsel(Oid opid, - Oid relid, - AttrNumber attno, - Datum value, - int32 flag) +scalarltsel(Oid opid, + Oid relid, + AttrNumber attno, + Datum value, + int32 flag) { float64 result; @@ -253,19 +250,19 @@ intltsel(Oid opid, */ oprtuple = get_operator_tuple(opid); if (! HeapTupleIsValid(oprtuple)) - elog(ERROR, "intltsel: no tuple for operator %u", opid); + elog(ERROR, "scalarltsel: no tuple for operator %u", opid); ltype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprleft; rtype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprright; /* Convert the constant to a uniform comparison scale. */ - if (! convert_to_scale(value, - ((flag & SEL_RIGHT) ? rtype : ltype), - &val)) + if (! convert_to_scalar(value, + ((flag & SEL_RIGHT) ? rtype : ltype), + &val)) { - /* Ideally we'd produce an error here, on the grounds that - * the given operator shouldn't have intltsel registered as its + /* Ideally we'd produce an error here, on the grounds that the + * given operator shouldn't have scalarltsel registered as its * selectivity func unless we can deal with its operand types. - * But currently, all manner of stuff is invoking intltsel, + * But currently, all manner of stuff is invoking scalarltsel, * so give a default estimate until that can be fixed. */ *result = DEFAULT_INEQ_SEL; @@ -276,7 +273,7 @@ intltsel(Oid opid, getattproperties(relid, attno, &typid, &typlen, &typbyval, &typmod); - if (! getattstatistics(relid, attno, opid, typid, typmod, + if (! getattstatistics(relid, attno, typid, typmod, NULL, NULL, NULL, &loval, &hival)) { @@ -286,8 +283,8 @@ intltsel(Oid opid, } /* Convert the attribute's loval/hival to common scale. */ - if (! convert_to_scale(loval, typid, &low) || - ! convert_to_scale(hival, typid, &high)) + if (! convert_to_scalar(loval, typid, &low) || + ! convert_to_scalar(hival, typid, &high)) { /* See above comments... */ if (! typbyval) @@ -341,23 +338,23 @@ intltsel(Oid opid, } /* - * intgtsel - Selectivity of ">" (also ">=") for integers. + * scalargtsel - Selectivity of ">" (also ">=") for integers. * - * See above comments for intltsel. + * See above comments for scalarltsel. */ float64 -intgtsel(Oid opid, - Oid relid, - AttrNumber attno, - Datum value, - int32 flag) +scalargtsel(Oid opid, + Oid relid, + AttrNumber attno, + Datum value, + int32 flag) { float64 result; /* Compute selectivity of "<", then invert --- but only if we * were able to produce a non-default estimate. */ - result = intltsel(opid, relid, attno, value, flag); + result = scalarltsel(opid, relid, attno, value, flag); if (*result != DEFAULT_INEQ_SEL) *result = 1.0 - *result; return result; @@ -429,14 +426,14 @@ neqjoinsel(Oid opid, } /* - * intltjoinsel - Join selectivity of "<" and "<=" + * scalarltjoinsel - Join selectivity of "<" and "<=" for scalars */ float64 -intltjoinsel(Oid opid, - Oid relid1, - AttrNumber attno1, - Oid relid2, - AttrNumber attno2) +scalarltjoinsel(Oid opid, + Oid relid1, + AttrNumber attno1, + Oid relid2, + AttrNumber attno2) { float64 result; @@ -446,14 +443,14 @@ intltjoinsel(Oid opid, } /* - * intgtjoinsel - Join selectivity of ">" and ">=" + * scalargtjoinsel - Join selectivity of ">" and ">=" for scalars */ float64 -intgtjoinsel(Oid opid, - Oid relid1, - AttrNumber attno1, - Oid relid2, - AttrNumber attno2) +scalargtjoinsel(Oid opid, + Oid relid1, + AttrNumber attno1, + Oid relid2, + AttrNumber attno2) { float64 result; @@ -463,21 +460,25 @@ intgtjoinsel(Oid opid, } /* - * convert_to_scale - * Convert a given value of the indicated type to the comparison - * scale needed by intltsel(). Returns "true" if successful. + * convert_to_scalar + * Convert a non-NULL value of the indicated type to the comparison + * scale needed by scalarltsel()/scalargtsel(). + * Returns "true" if successful. * * All numeric datatypes are simply converted to their equivalent - * "double" values. - * Future extension: convert string-like types to some suitable scale. + * "double" values. String datatypes are converted to a crude scale + * using their first character (only if it is in the ASCII range, + * to try to avoid problems with non-ASCII collating sequences). */ -static bool -convert_to_scale(Datum value, Oid typid, - double *scaleval) +bool +convert_to_scalar(Datum value, Oid typid, + double *scaleval) { - /* Fast-path conversions for some built-in types */ switch (typid) { + /* + * Built-in numeric types + */ case BOOLOID: *scaleval = (double) DatumGetUInt8(value); return true; @@ -504,18 +505,54 @@ convert_to_scale(Datum value, Oid typid, /* we can treat OIDs as integers... */ *scaleval = (double) DatumGetObjectId(value); return true; + + /* + * Built-in string types + */ + case CHAROID: + { + char ch = DatumGetChar(value); + + if (ch >= 0 && ch < 127) + { + *scaleval = (double) ch; + return true; + } + break; + } + case BPCHAROID: + case VARCHAROID: case TEXTOID: - /* - * Eventually this should get handled by somehow scaling as a - * string value. For now, we need to call it out to avoid - * falling into the default case, because there is a float8(text) - * function declared in pg_proc that will do the wrong thing :-( - */ + if (VARSIZE(DatumGetPointer(value)) > VARHDRSZ) + { + char ch = * (char *) VARDATA(DatumGetPointer(value)); + + if (ch >= 0 && ch < 127) + { + *scaleval = (double) ch; + return true; + } + } + break; + case NAMEOID: + { + NameData *nm = (NameData *) DatumGetPointer(value); + char ch = NameStr(*nm)[0]; + + if (ch >= 0 && ch < 127) + { + *scaleval = (double) ch; + return true; + } break; + } + default: { - /* See whether there is a registered type-conversion function, + /* + * See whether there is a registered type-conversion function, * namely a procedure named "float8" with the right signature. + * If so, assume we can convert the value to the numeric scale. */ Oid oid_array[FUNC_MAX_ARGS]; HeapTuple ftup; @@ -589,7 +626,9 @@ getattproperties(Oid relid, AttrNumber attnum, * after use if the data type is not by-value.) */ static bool -getattstatistics(Oid relid, AttrNumber attnum, Oid opid, Oid typid, +getattstatistics(Oid relid, + AttrNumber attnum, + Oid typid, int32 typmod, double *nullfrac, double *commonfrac, @@ -603,8 +642,15 @@ getattstatistics(Oid relid, AttrNumber attnum, Oid opid, Oid typid, Oid typelem; bool isnull; - /* We assume that there will only be one entry in pg_statistic - * for the given rel/att. Someday, VACUUM might store more than one... + /* + * We assume that there will only be one entry in pg_statistic for + * the given rel/att, so we search WITHOUT considering the staop + * column. Someday, VACUUM might store more than one entry per rel/att, + * corresponding to more than one possible sort ordering defined for + * the column type. However, to make that work we will need to figure + * out which staop to search for --- it's not necessarily the one we + * have at hand! (For example, we might have a '>' operator rather than + * the '<' operator that will appear in staop.) */ tuple = SearchSysCacheTuple(STATRELID, ObjectIdGetDatum(relid), |