diff options
Diffstat (limited to 'doc/src')
-rw-r--r-- | doc/src/sgml/catalogs.sgml | 13 | ||||
-rw-r--r-- | doc/src/sgml/func.sgml | 82 | ||||
-rw-r--r-- | doc/src/sgml/perform.sgml | 66 | ||||
-rw-r--r-- | doc/src/sgml/planstats.sgml | 116 | ||||
-rw-r--r-- | doc/src/sgml/ref/create_statistics.sgml | 35 |
5 files changed, 307 insertions, 5 deletions
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 45ed077654e..026c422cd22 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -6562,7 +6562,8 @@ SCRAM-SHA-256$<replaceable><iteration count></replaceable>:<replaceable>&l An array containing codes for the enabled statistic kinds; valid values are: <literal>d</literal> for n-distinct statistics, - <literal>f</literal> for functional dependency statistics + <literal>f</literal> for functional dependency statistics, and + <literal>m</literal> for most common values (MCV) list statistics </entry> </row> @@ -6585,6 +6586,16 @@ SCRAM-SHA-256$<replaceable><iteration count></replaceable>:<replaceable>&l </entry> </row> + <row> + <entry><structfield>stxmcv</structfield></entry> + <entry><type>pg_mcv_list</type></entry> + <entry></entry> + <entry> + MCV (most-common values) list statistics, serialized as + <structname>pg_mcv_list</structname> type. + </entry> + </row> + </tbody> </tgroup> </table> diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 1a014732919..d24901126f0 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -22174,4 +22174,86 @@ CREATE EVENT TRIGGER test_table_rewrite_oid </sect2> </sect1> + <sect1 id="functions-statistics"> + <title>Statistics Information Functions</title> + + <indexterm zone="functions-statistics"> + <primary>function</primary> + <secondary>statistics</secondary> + </indexterm> + + <para> + <productname>PostgreSQL</productname> provides a function to inspect complex + statistics defined using the <command>CREATE STATISTICS</command> command. + </para> + + <sect2 id="functions-statistics-mcv"> + <title>Inspecting MCV lists</title> + + <indexterm> + <primary>pg_mcv_list_items</primary> + <secondary>pg_mcv_list</secondary> + </indexterm> + + <para> + <function>pg_mcv_list_items</function> returns a list of all items + stored in a multi-column <acronym>MCV</acronym> list, and returns the + following columns: + + <informaltable> + <tgroup cols="3"> + <thead> + <row> + <entry>Name</entry> + <entry>Type</entry> + <entry>Description</entry> + </row> + </thead> + + <tbody> + <row> + <entry><literal>index</literal></entry> + <entry><type>int</type></entry> + <entry>index of the item in the <acronym>MCV</acronym> list</entry> + </row> + <row> + <entry><literal>values</literal></entry> + <entry><type>text[]</type></entry> + <entry>values stored in the MCV item</entry> + </row> + <row> + <entry><literal>nulls</literal></entry> + <entry><type>boolean[]</type></entry> + <entry>flags identifying <literal>NULL</literal> values</entry> + </row> + <row> + <entry><literal>frequency</literal></entry> + <entry><type>double precision</type></entry> + <entry>frequency of this <acronym>MCV</acronym> item</entry> + </row> + <row> + <entry><literal>base_frequency</literal></entry> + <entry><type>double precision</type></entry> + <entry>base frequency of this <acronym>MCV</acronym> item</entry> + </row> + </tbody> + </tgroup> + </informaltable> + </para> + + <para> + The <function>pg_mcv_list_items</function> function can be used like this: + +<programlisting> +SELECT m.* FROM pg_statistic_ext, + pg_mcv_list_items(stxmcv) m WHERE stxname = 'stts'; +</programlisting> + + Values of the <type>pg_mcv_list</type> can be obtained only from the + <literal>pg_statistic_ext.stxmcv</literal> column. + </para> + </sect2> + + </sect1> + </chapter> diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 783d708073d..a84be851593 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -1285,6 +1285,72 @@ nd | {"1, 2": 33178, "1, 5": 33178, "2, 5": 27435, "1, 2, 5": 33178} plans. Otherwise, the <command>ANALYZE</command> cycles are just wasted. </para> </sect3> + + <sect3> + <title>Multivariate MCV lists</title> + + <para> + Another type of statistics stored for each column are most-common value + lists. This allows very accurate estimates for individual columns, but + may result in significant misestimates for queries with conditions on + multiple columns. + </para> + + <para> + To improve such estimates, <command>ANALYZE</command> can collect MCV + lists on combinations of columns. Similarly to functional dependencies + and n-distinct coefficients, it's impractical to do this for every + possible column grouping. Even more so in this case, as the MCV list + (unlike functional dependencies and n-distinct coefficients) does store + the common column values. So data is collected only for those groups + of columns appearing together in a statistics object defined with the + <literal>mcv</literal> option. + </para> + + <para> + Continuing the previous example, the MCV list for a table of ZIP codes + might look like the following (unlike for simpler types of statistics, + a function is required for inspection of MCV contents): + +<programlisting> +CREATE STATISTICS stts3 (mcv) ON state, city FROM zipcodes; + +ANALYZE zipcodes; + +SELECT m.* FROM pg_statistic_ext, + pg_mcv_list_items(stxmcv) m WHERE stxname = 'stts3'; + + index | values | nulls | frequency | base_frequency +-------+------------------------+-------+-----------+---------------- + 0 | {Washington, DC} | {f,f} | 0.003467 | 2.7e-05 + 1 | {Apo, AE} | {f,f} | 0.003067 | 1.9e-05 + 2 | {Houston, TX} | {f,f} | 0.002167 | 0.000133 + 3 | {El Paso, TX} | {f,f} | 0.002 | 0.000113 + 4 | {New York, NY} | {f,f} | 0.001967 | 0.000114 + 5 | {Atlanta, GA} | {f,f} | 0.001633 | 3.3e-05 + 6 | {Sacramento, CA} | {f,f} | 0.001433 | 7.8e-05 + 7 | {Miami, FL} | {f,f} | 0.0014 | 6e-05 + 8 | {Dallas, TX} | {f,f} | 0.001367 | 8.8e-05 + 9 | {Chicago, IL} | {f,f} | 0.001333 | 5.1e-05 + ... +(99 rows) +</programlisting> + This indicates that the most common combination of city and state is + Washington in DC, with actual frequency (in the sample) about 0.35%. + The base frequency of the combination (as computed from the simple + per-column frequencies) is only 0.0027%, resulting in two orders of + magnitude under-estimates. + </para> + + <para> + It's advisable to create <acronym>MCV</acronym> statistics objects only + on combinations of columns that are actually used in conditions together, + and for which misestimation of the number of groups is resulting in bad + plans. Otherwise, the <command>ANALYZE</command> and planning cycles + are just wasted. + </para> + </sect3> + </sect2> </sect1> diff --git a/doc/src/sgml/planstats.sgml b/doc/src/sgml/planstats.sgml index ef643ad0646..4b1d3f4952e 100644 --- a/doc/src/sgml/planstats.sgml +++ b/doc/src/sgml/planstats.sgml @@ -455,7 +455,7 @@ rows = (outer_cardinality * inner_cardinality) * selectivity <secondary>multivariate</secondary> </indexterm> - <sect2> + <sect2 id="functional-dependencies"> <title>Functional Dependencies</title> <para> @@ -540,7 +540,7 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1; </para> </sect2> - <sect2> + <sect2 id="multivariate-ndistinct-counts"> <title>Multivariate N-Distinct Counts</title> <para> @@ -585,6 +585,118 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT COUNT(*) FROM t GROUP BY a, b; </para> </sect2> + + <sect2 id="mcv-lists"> + <title>MCV lists</title> + + <para> + As explained in <xref linkend="functional-dependencies"/>, functional + dependencies are very cheap and efficient type of statistics, but their + main limitation is their global nature (only tracking dependencies at + the column level, not between individual column values). + </para> + + <para> + This section introduces multivariate variant of <acronym>MCV</acronym> + (most-common values) lists, a straightforward extension of the per-column + statistics described in <xref linkend="row-estimation-examples"/>. These + statistics address the limitation by storing individual values, but it is + naturally more expensive, both in terms of building the statistics in + <command>ANALYZE</command>, storage and planning time. + </para> + + <para> + Let's look at the query from <xref linkend="functional-dependencies"/> + again, but this time with a <acronym>MCV</acronym> list created on the + same set of columns (be sure to drop the functional dependencies, to + make sure the planner uses the newly created statistics). + +<programlisting> +DROP STATISTICS stts; +CREATE STATISTICS stts2 (mcv) ON a, b FROM t; +ANALYZE t; +EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1; + QUERY PLAN +------------------------------------------------------------------------------- + Seq Scan on t (cost=0.00..195.00 rows=100 width=8) (actual rows=100 loops=1) + Filter: ((a = 1) AND (b = 1)) + Rows Removed by Filter: 9900 +</programlisting> + + The estimate is as accurate as with the functional dependencies, mostly + thanks to the table being fairly small and having a simple distribution + with a low number of distinct values. Before looking at the second query, + which was not handled by functional dependencies particularly well, + let's inspect the <acronym>MCV</acronym> list a bit. + </para> + + <para> + Inspecting the <acronym>MCV</acronym> list is possible using + <function>pg_mcv_list_items</function> set-returning function. + +<programlisting> +SELECT m.* FROM pg_statistic_ext, + pg_mcv_list_items(stxmcv) m WHERE stxname = 'stts2'; + index | values | nulls | frequency | base_frequency +-------+----------+-------+-----------+---------------- + 0 | {0, 0} | {f,f} | 0.01 | 0.0001 + 1 | {1, 1} | {f,f} | 0.01 | 0.0001 + ... + 49 | {49, 49} | {f,f} | 0.01 | 0.0001 + 50 | {50, 50} | {f,f} | 0.01 | 0.0001 + ... + 97 | {97, 97} | {f,f} | 0.01 | 0.0001 + 98 | {98, 98} | {f,f} | 0.01 | 0.0001 + 99 | {99, 99} | {f,f} | 0.01 | 0.0001 +(100 rows) +</programlisting> + + This confirms there are 100 distinct combinations in the two columns, and + all of them are about equally likely (1% frequency for each one). The + base frequency is the frequency computed from per-column statistics, as if + there were no multi-column statistics. Had there been any null values in + either of the columns, this would be identified in the + <structfield>nulls</structfield> column. + </para> + + <para> + When estimating the selectivity, the planner applies all the conditions + on items in the <acronym>MCV</acronym> list, and then sums the frequencies + of the matching ones. See <function>mcv_clauselist_selectivity</function> + in <filename>src/backend/statistics/mcv.c</filename> for details. + </para> + + <para> + Compared to functional dependencies, <acronym>MCV</acronym> lists have two + major advantages. Firstly, the list stores actual values, making it possible + to decide which combinations are compatible. + +<programlisting> +EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 10; + QUERY PLAN +--------------------------------------------------------------------------- + Seq Scan on t (cost=0.00..195.00 rows=1 width=8) (actual rows=0 loops=1) + Filter: ((a = 1) AND (b = 10)) + Rows Removed by Filter: 10000 +</programlisting> + + Secondly, <acronym>MCV</acronym> lists handle a wider range of clause types, + not just equality clauses like functional dependencies. See for example the + example range query, presented earlier: + +<programlisting> +EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a <= 49 AND b > 49; + QUERY PLAN +--------------------------------------------------------------------------- + Seq Scan on t (cost=0.00..195.00 rows=1 width=8) (actual rows=0 loops=1) + Filter: ((a <= 49) AND (b > 49)) + Rows Removed by Filter: 10000 +</programlisting> + + </para> + + </sect2> + </sect1> <sect1 id="planner-stats-security"> diff --git a/doc/src/sgml/ref/create_statistics.sgml b/doc/src/sgml/ref/create_statistics.sgml index 539f5bded54..ae1d8024a4e 100644 --- a/doc/src/sgml/ref/create_statistics.sgml +++ b/doc/src/sgml/ref/create_statistics.sgml @@ -81,9 +81,10 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_na <para> A statistics kind to be computed in this statistics object. Currently supported kinds are - <literal>ndistinct</literal>, which enables n-distinct statistics, and + <literal>ndistinct</literal>, which enables n-distinct statistics, <literal>dependencies</literal>, which enables functional - dependency statistics. + dependency statistics, and <literal>mcv</literal> which enables + most-common values lists. If this clause is omitted, all supported statistics kinds are included in the statistics object. For more information, see <xref linkend="planner-stats-extended"/> @@ -164,6 +165,36 @@ EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 0); conditions are redundant and does not underestimate the row count. </para> + <para> + Create table <structname>t2</structname> with two perfectly correlated columns + (containing identical data), and a MCV list on those columns: + +<programlisting> +CREATE TABLE t2 ( + a int, + b int +); + +INSERT INTO t2 SELECT mod(i,100), mod(i,100) + FROM generate_series(1,1000000) s(i); + +CREATE STATISTICS s2 (mcv) ON (a, b) FROM t2; + +ANALYZE t2; + +-- valid combination (found in MCV) +EXPLAIN ANALYZE SELECT * FROM t2 WHERE (a = 1) AND (b = 1); + +-- invalid combination (not found in MCV) +EXPLAIN ANALYZE SELECT * FROM t2 WHERE (a = 1) AND (b = 2); +</programlisting> + + The MCV list gives the planner more detailed information about the + specific values that commonly appear in the table, as well as an upper + bound on the selectivities of combinations of values that do not appear + in the table, allowing it to generate better estimates in both cases. + </para> + </refsect1> <refsect1> |