Implement multivariate n-distinct coefficients

Add support for explicitly declared statistic objects (CREATE STATISTICS), allowing collection of statistics on more complex combinations that individual table columns. Companion commands DROP STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are added too. All this DDL has been designed so that more statistic types can be added later on, such as multivariate most-common-values and multivariate histograms between columns of a single table, leaving room for permitting columns on multiple tables, too, as well as expressions. This commit only adds support for collection of n-distinct coefficient on user-specified sets of columns in a single table. This is useful to estimate number of distinct groups in GROUP BY and DISTINCT clauses; estimation errors there can cause over-allocation of memory in hashed aggregates, for instance, so it's a worthwhile problem to solve. A new special pseudo-type pg_ndistinct is used. (num-distinct estimation was deemed sufficiently useful by itself that this is worthwhile even if no further statistic types are added immediately; so much so that another version of essentially the same functionality was submitted by Kyotaro Horiguchi: https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp though this commit does not use that code.) Author: Tomas Vondra. Some code rework by Álvaro. Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes, Ideriha Takeshi Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
author: Alvaro Herrera <alvherre@alvh.no-ip.org> 2017-03-24 14:06:10 -0300
committer: Alvaro Herrera <alvherre@alvh.no-ip.org> 2017-03-24 14:06:10 -0300
commit: 7b504eb282ca2f5104b5c00b4f05a3ef6bb1385b (patch)
tree: 4b12f53c5bd25a03f1016f1daa0809606b47df3a /src/backend/utils/adt/selfuncs.c
parent: f120b614e070aed39586d1443193738a149a90d4 (diff)
download: postgresql-7b504eb282ca2f5104b5c00b4f05a3ef6bb1385b.tar.gz
postgresql-7b504eb282ca2f5104b5c00b4f05a3ef6bb1385b.zip
1 files changed, 174 insertions, 7 deletions
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index f8b28fe0e61..cc24c8aeb56 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -110,6 +110,7 @@
 #include "catalog/pg_operator.h"
 #include "catalog/pg_opfamily.h"
 #include "catalog/pg_statistic.h"
+#include "catalog/pg_statistic_ext.h"
 #include "catalog/pg_type.h"
 #include "executor/executor.h"
 #include "mb/pg_wchar.h"
@@ -126,6 +127,7 @@
 #include "parser/parse_clause.h"
 #include "parser/parse_coerce.h"
 #include "parser/parsetree.h"
+#include "statistics/statistics.h"
 #include "utils/builtins.h"
 #include "utils/bytea.h"
 #include "utils/date.h"
@@ -164,6 +166,8 @@ static double eqjoinsel_inner(Oid operator,
 static double eqjoinsel_semi(Oid operator,
 			   VariableStatData *vardata1, VariableStatData *vardata2,
 			   RelOptInfo *inner_rel);
+static bool estimate_multivariate_ndistinct(PlannerInfo *root,
+			   RelOptInfo *rel, List **varinfos, double *ndistinct);
 static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
 				  Datum lobound, Datum hibound, Oid boundstypid,
 				  double *scaledlobound, double *scaledhibound);
@@ -3398,25 +3402,25 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 	{
 		GroupVarInfo *varinfo1 = (GroupVarInfo *) linitial(varinfos);
 		RelOptInfo *rel = varinfo1->rel;
-		double		reldistinct = varinfo1->ndistinct;
+		double		reldistinct = 1;
 		double		relmaxndistinct = reldistinct;
 		int			relvarcount = 1;
 		List	   *newvarinfos = NIL;
+		List	   *relvarinfos = NIL;
 
 		/*
-		 * Get the product of numdistinct estimates of the Vars for this rel.
-		 * Also, construct new varinfos list of remaining Vars.
+		 * Split the list of varinfos in two - one for the current rel,
+		 * one for remaining Vars on other rels.
 		 */
+		relvarinfos = lcons(varinfo1, relvarinfos);
 		for_each_cell(l, lnext(list_head(varinfos)))
 		{
 			GroupVarInfo *varinfo2 = (GroupVarInfo *) lfirst(l);
 
 			if (varinfo2->rel == varinfo1->rel)
 			{
-				reldistinct *= varinfo2->ndistinct;
-				if (relmaxndistinct < varinfo2->ndistinct)
-					relmaxndistinct = varinfo2->ndistinct;
-				relvarcount++;
+				/* varinfos on current rel */
+				relvarinfos = lcons(varinfo2, relvarinfos);
 			}
 			else
 			{
@@ -3426,6 +3430,43 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 		}
 
 		/*
+		 * Get the numdistinct estimate for the Vars of this rel.  We
+		 * iteratively search for multivariate n-distinct with maximum number
+		 * of vars; assuming that each var group is independent of the others,
+		 * we multiply them together.  Any remaining relvarinfos after
+		 * no more multivariate matches are found are assumed independent too,
+		 * so their individual ndistinct estimates are multiplied also.
+		 */
+		while (relvarinfos)
+		{
+			double		mvndistinct;
+
+			if (estimate_multivariate_ndistinct(root, rel, &relvarinfos,
+												&mvndistinct))
+			{
+				reldistinct *= mvndistinct;
+				if (relmaxndistinct < mvndistinct)
+					relmaxndistinct = mvndistinct;
+				relvarcount++;	/* inaccurate, but doesn't matter */
+			}
+			else
+			{
+				foreach (l, relvarinfos)
+				{
+					GroupVarInfo *varinfo2 = (GroupVarInfo *) lfirst(l);
+
+					reldistinct *= varinfo2->ndistinct;
+					if (relmaxndistinct < varinfo2->ndistinct)
+						relmaxndistinct = varinfo2->ndistinct;
+					relvarcount++;
+				}
+
+				/* we're done with this relation */
+				relvarinfos = NIL;
+			}
+		}
+
+		/*
 		 * Sanity check --- don't divide by zero if empty relation.
 		 */
 		Assert(rel->reloptkind == RELOPT_BASEREL);
@@ -3668,6 +3709,132 @@ estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
  */
 
 /*
+ * Find applicable ndistinct statistics for the given list of VarInfos (which
+ * must all belong to the given rel), and update *ndistinct to the estimate of
+ * the MVNDistinctItem that best matches.  If a match it found, *varinfos is
+ * updated to remove the list of matched varinfos.
+ *
+ * Varinfos that aren't for simple Vars are ignored.
+ *
+ * Return TRUE if we're able to find a match, FALSE otherwise.
+ */
+static bool
+estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
+								List **varinfos, double *ndistinct)
+{
+	ListCell   *lc;
+	Bitmapset  *attnums = NULL;
+	int			nmatches;
+	Oid			statOid = InvalidOid;
+	MVNDistinct *stats;
+	Bitmapset  *matched = NULL;
+
+	/* bail out immediately if the table has no extended statistics */
+	if (!rel->statlist)
+		return false;
+
+	/* Determine the attnums we're looking for */
+	foreach(lc, *varinfos)
+	{
+		GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc);
+
+		Assert(varinfo->rel == rel);
+
+		if (IsA(varinfo->var, Var))
+		{
+			attnums = bms_add_member(attnums,
+									 ((Var *) varinfo->var)->varattno);
+		}
+	}
+
+	/* look for the ndistinct statistics matching the most vars */
+	nmatches = 1; /* we require at least two matches */
+	foreach(lc, rel->statlist)
+	{
+		StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
+		Bitmapset  *shared;
+
+		/* skip statistics of other kinds */
+		if (info->kind != STATS_EXT_NDISTINCT)
+			continue;
+
+		/* compute attnums shared by the vars and the statistic */
+		shared = bms_intersect(info->keys, attnums);
+
+		/*
+		 * Does this statistics matches more columns than the currently
+		 * best statistic?  If so, use this one instead.
+		 *
+		 * XXX This should break ties using name of the statistic, or
+		 * something like that, to make the outcome stable.
+		 */
+		if (bms_num_members(shared) > nmatches)
+		{
+			statOid = info->statOid;
+			nmatches = bms_num_members(shared);
+			matched = shared;
+		}
+	}
+
+	/* No match? */
+	if (statOid == InvalidOid)
+		return false;
+	Assert(nmatches > 1 && matched != NULL);
+
+	stats = statext_ndistinct_load(statOid);
+
+	/*
+	 * If we have a match, search it for the specific item that matches (there
+	 * must be one), and construct the output values.
+	 */
+	if (stats)
+	{
+		int		i;
+		List   *newlist = NIL;
+		MVNDistinctItem *item = NULL;
+
+		/* Find the specific item that exactly matches the combination */
+		for (i = 0; i < stats->nitems; i++)
+		{
+			MVNDistinctItem *tmpitem = &stats->items[i];
+
+			if (bms_subset_compare(tmpitem->attrs, matched) == BMS_EQUAL)
+			{
+				item = tmpitem;
+				break;
+			}
+		}
+
+		/* make sure we found an item */
+		if (!item)
+			elog(ERROR, "corrupt MVNDistinct entry");
+
+		/* Form the output varinfo list, keeping only unmatched ones */
+		foreach(lc, *varinfos)
+		{
+			GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc);
+			AttrNumber	attnum;
+
+			if (!IsA(varinfo->var, Var))
+			{
+				newlist = lappend(newlist, varinfo);
+				continue;
+			}
+
+			attnum = ((Var *) varinfo->var)->varattno;
+			if (!bms_is_member(attnum, matched))
+				newlist = lappend(newlist, varinfo);
+		}
+
+		*varinfos = newlist;
+		*ndistinct = item->ndistinct;
+		return true;
+	}
+
+	return false;
+}
+
+/*
  * convert_to_scalar
  *	  Convert non-NULL values of the indicated types to the comparison
  *	  scale needed by scalarineqsel().
author	Alvaro Herrera <alvherre@alvh.no-ip.org>	2017-03-24 14:06:10 -0300
committer	Alvaro Herrera <alvherre@alvh.no-ip.org>	2017-03-24 14:06:10 -0300
commit	7b504eb282ca2f5104b5c00b4f05a3ef6bb1385b (patch)
tree	4b12f53c5bd25a03f1016f1daa0809606b47df3a /src/backend/utils/adt/selfuncs.c
parent	f120b614e070aed39586d1443193738a149a90d4 (diff)
download	postgresql-7b504eb282ca2f5104b5c00b4f05a3ef6bb1385b.tar.gz postgresql-7b504eb282ca2f5104b5c00b4f05a3ef6bb1385b.zip