aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doc/src/sgml/catalogs.sgml6
-rw-r--r--src/backend/commands/analyze.c16
-rw-r--r--src/backend/tsearch/ts_typanalyze.c2
-rw-r--r--src/backend/utils/adt/rangetypes_typanalyze.c4
-rw-r--r--src/backend/utils/adt/selfuncs.c12
-rw-r--r--src/include/catalog/pg_statistic.h15
6 files changed, 34 insertions, 21 deletions
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index 8fd055b0090..ccb9b97a8cb 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -5849,9 +5849,9 @@
<entry>The number of distinct nonnull data values in the column.
A value greater than zero is the actual number of distinct values.
A value less than zero is the negative of a multiplier for the number
- of rows in the table; for example, a column in which values appear about
- twice on the average could be represented by
- <structfield>stadistinct</> = -0.5.
+ of rows in the table; for example, a column in which about 80% of the
+ values are nonnull and each nonnull value appears about twice on
+ average could be represented by <structfield>stadistinct</> = -0.4.
A zero value means the number of distinct values is unknown.
</entry>
</row>
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 5fcedd78554..9ac71220a2a 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -2049,8 +2049,11 @@ compute_distinct_stats(VacAttrStatsP stats,
if (nmultiple == 0)
{
- /* If we found no repeated values, assume it's a unique column */
- stats->stadistinct = -1.0;
+ /*
+ * If we found no repeated non-null values, assume it's a unique
+ * column; but be sure to discount for any nulls we found.
+ */
+ stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
}
else if (track_cnt < track_max && toowide_cnt == 0 &&
nmultiple == track_cnt)
@@ -2426,8 +2429,11 @@ compute_scalar_stats(VacAttrStatsP stats,
if (nmultiple == 0)
{
- /* If we found no repeated values, assume it's a unique column */
- stats->stadistinct = -1.0;
+ /*
+ * If we found no repeated non-null values, assume it's a unique
+ * column; but be sure to discount for any nulls we found.
+ */
+ stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
}
else if (toowide_cnt == 0 && nmultiple == ndistinct)
{
@@ -2753,7 +2759,7 @@ compute_scalar_stats(VacAttrStatsP stats,
else
stats->stawidth = stats->attrtype->typlen;
/* Assume all too-wide values are distinct, so it's a unique column */
- stats->stadistinct = -1.0;
+ stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
}
else if (null_cnt > 0)
{
diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c
index 0f851ead060..817453ce011 100644
--- a/src/backend/tsearch/ts_typanalyze.c
+++ b/src/backend/tsearch/ts_typanalyze.c
@@ -295,7 +295,7 @@ compute_tsvector_stats(VacAttrStats *stats,
stats->stawidth = total_width / (double) nonnull_cnt;
/* Assume it's a unique column (see notes above) */
- stats->stadistinct = -1.0;
+ stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
/*
* Construct an array of the interesting hashtable items, that is,
diff --git a/src/backend/utils/adt/rangetypes_typanalyze.c b/src/backend/utils/adt/rangetypes_typanalyze.c
index fcb71d3f36e..56504fcf3c2 100644
--- a/src/backend/utils/adt/rangetypes_typanalyze.c
+++ b/src/backend/utils/adt/rangetypes_typanalyze.c
@@ -203,7 +203,9 @@ compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
/* Do the simple null-frac and width stats */
stats->stanullfrac = (double) null_cnt / (double) samplerows;
stats->stawidth = total_width / (double) non_null_cnt;
- stats->stadistinct = -1.0;
+
+ /* Estimate that non-null values are unique */
+ stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
/* Must copy the target values into anl_context */
old_cxt = MemoryContextSwitchTo(stats->anl_context);
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index cc2a9a1b6c5..56943f2a87a 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -4738,6 +4738,7 @@ double
get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
{
double stadistinct;
+ double stanullfrac = 0.0;
double ntuples;
*isdefault = false;
@@ -4745,7 +4746,8 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
/*
* Determine the stadistinct value to use. There are cases where we can
* get an estimate even without a pg_statistic entry, or can get a better
- * value than is in pg_statistic.
+ * value than is in pg_statistic. Grab stanullfrac too if we can find it
+ * (otherwise, assume no nulls, for lack of any better idea).
*/
if (HeapTupleIsValid(vardata->statsTuple))
{
@@ -4754,6 +4756,7 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
stadistinct = stats->stadistinct;
+ stanullfrac = stats->stanullfrac;
}
else if (vardata->vartype == BOOLOID)
{
@@ -4777,7 +4780,7 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
{
case ObjectIdAttributeNumber:
case SelfItemPointerAttributeNumber:
- stadistinct = -1.0; /* unique */
+ stadistinct = -1.0; /* unique (and all non null) */
break;
case TableOidAttributeNumber:
stadistinct = 1.0; /* only 1 value */
@@ -4799,10 +4802,11 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
* If there is a unique index or DISTINCT clause for the variable, assume
* it is unique no matter what pg_statistic says; the statistics could be
* out of date, or we might have found a partial unique index that proves
- * the var is unique for this query.
+ * the var is unique for this query. However, we'd better still believe
+ * the null-fraction statistic.
*/
if (vardata->isunique)
- stadistinct = -1.0;
+ stadistinct = -1.0 * (1.0 - stanullfrac);
/*
* If we had an absolute estimate, use that.
diff --git a/src/include/catalog/pg_statistic.h b/src/include/catalog/pg_statistic.h
index 5d775a94fda..c1a624d979a 100644
--- a/src/include/catalog/pg_statistic.h
+++ b/src/include/catalog/pg_statistic.h
@@ -57,13 +57,14 @@ CATALOG(pg_statistic,2619) BKI_WITHOUT_OIDS
* > 0 actual number of distinct values
* < 0 negative of multiplier for number of rows
* The special negative case allows us to cope with columns that are
- * unique (stadistinct = -1) or nearly so (for example, a column in
- * which values appear about twice on the average could be represented
- * by stadistinct = -0.5). Because the number-of-rows statistic in
- * pg_class may be updated more frequently than pg_statistic is, it's
- * important to be able to describe such situations as a multiple of
- * the number of rows, rather than a fixed number of distinct values.
- * But in other cases a fixed number is correct (eg, a boolean column).
+ * unique (stadistinct = -1) or nearly so (for example, a column in which
+ * non-null values appear about twice on the average could be represented
+ * by stadistinct = -0.5 if there are no nulls, or -0.4 if 20% of the
+ * column is nulls). Because the number-of-rows statistic in pg_class may
+ * be updated more frequently than pg_statistic is, it's important to be
+ * able to describe such situations as a multiple of the number of rows,
+ * rather than a fixed number of distinct values. But in other cases a
+ * fixed number is correct (eg, a boolean column).
* ----------------
*/
float4 stadistinct;