Collect and use element-frequency statistics for arrays.

This patch improves selectivity estimation for the array <@, &&, and @> (containment and overlaps) operators. It enables collection of statistics about individual array element values by ANALYZE, and introduces operator-specific estimators that use these stats. In addition, ScalarArrayOpExpr constructs of the forms "const = ANY/ALL (array_column)" and "const <> ANY/ALL (array_column)" are estimated by treating them as variants of the containment operators.

Since we still collect scalar-style stats about the array values as a whole, the pg_stats view is expanded to show both these stats and the array-style stats in separate columns. This creates an incompatible change in how stats for tsvector columns are displayed in pg_stats: the stats about lexemes are now displayed in the array-related columns instead of the original scalar-related columns.

There are a few loose ends here, notably that it'd be nice to be able to suppress either the scalar-style stats or the array-element stats for columns for which they're not useful. But the patch is in good enough shape to commit for wider testing.

Alexander Korotkov, reviewed by Noah Misch and Nathan Boley

Collect and use element-frequency statistics for arrays.
This patch improves selectivity estimation for the array <@, &&, and @> (containment and overlaps) operators. It enables collection of statistics about individual array element values by ANALYZE, and introduces operator-specific estimators that use these stats. In addition, ScalarArrayOpExpr constructs of the forms "const = ANY/ALL (array_column)" and "const <> ANY/ALL (array_column)" are estimated by treating them as variants of the containment operators.

Since we still collect scalar-style stats about the array values as a whole, the pg_stats view is expanded to show both these stats and the array-style stats in separate columns. This creates an incompatible change in how stats for tsvector columns are displayed in pg_stats: the stats about lexemes are now displayed in the array-related columns instead of the original scalar-related columns.

There are a few loose ends here, notably that it'd be nice to be able to suppress either the scalar-style stats or the array-element stats for columns for which they're not useful. But the patch is in good enough shape to commit for wider testing.

Alexander Korotkov, reviewed by Noah Misch and Nathan Boley
0e5e167a · Tom Lane · 34c97844 · 0e5e167a · 0e5e167a · 0e5e167a
24 changed files
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -5354,9 +5354,9 @@
       Column data values of the appropriate kind for the
       <replaceable>N</>th <quote>slot</quote>, or null if the slot
       kind does not store any data values.  Each array's element
-       values are actually of the specific column's data type, so there
-       is no way to define these columns' type more specifically than
-       <type>anyarray</>.
+       values are actually of the specific column's data type, or a related
+       type such as an array's element type, so there is no way to define
+       these columns' type more specifically than <type>anyarray</>.
      </entry>
     </row>
    </tbody>
@@ -8291,8 +8291,6 @@
      <entry>
       A list of the most common values in the column. (Null if
       no values seem to be more common than any others.)
-       For some data types such as <type>tsvector</>, this is a list of
-       the most common element values rather than values of the type itself.
      </entry>
     </row>

@@ -8301,12 +8299,9 @@
      <entry><type>real[]</type></entry>
      <entry></entry>
      <entry>
-       A list of the frequencies of the most common values or elements,
+       A list of the frequencies of the most common values,
       i.e., number of occurrences of each divided by total number of rows.
       (Null when <structfield>most_common_vals</structfield> is.)
-       For some data types such as <type>tsvector</>, it can also store some
-       additional information, making it longer than the
-       <structfield>most_common_vals</> array.
      </entry>
     </row>

@@ -8338,13 +8333,47 @@
       type does not have a <literal>&lt;</> operator.)
      </entry>
     </row>
+
+     <row>
+      <entry><structfield>most_common_elems</structfield></entry>
+      <entry><type>anyarray</type></entry>
+      <entry></entry>
+      <entry>
+       A list of non-null element values most often appearing within values of
+       the column. (Null for scalar types.)
+      </entry>
+     </row>
+
+     <row>
+      <entry><structfield>most_common_elem_freqs</structfield></entry>
+      <entry><type>real[]</type></entry>
+      <entry></entry>
+      <entry>
+       A list of the frequencies of the most common element values, i.e., the
+       fraction of rows containing at least one instance of the given value.
+       Two or three additional values follow the per-element frequencies;
+       these are the minimum and maximum of the preceding per-element
+       frequencies, and optionally the frequency of null elements.
+       (Null when <structfield>most_common_elems</structfield> is.)
+      </entry>
+     </row>
+
+     <row>
+      <entry><structfield>elem_count_histogram</structfield></entry>
+      <entry><type>real[]</type></entry>
+      <entry></entry>
+      <entry>
+       A histogram of the counts of distinct non-null element values within the
+       values of the column, followed by the average number of distinct
+       non-null elements.  (Null for scalar types.)
+      </entry>
+     </row>
    </tbody>
   </tgroup>
  </table>

  <para>
-   The maximum number of entries in the <structfield>most_common_vals</>
-   and <structfield>histogram_bounds</> arrays can be set on a
+   The maximum number of entries in the array fields can be controlled on a
   column-by-column basis using the <command>ALTER TABLE SET STATISTICS</>
   command, or globally by setting the
   <xref linkend="guc-default-statistics-target"> run-time parameter.

--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -1182,7 +1182,7 @@ heap_create_with_catalog(const char *relname,
 				   F_ARRAY_SEND,	/* array send (bin) proc */
 				   InvalidOid,	/* typmodin procedure - none */
 				   InvalidOid,	/* typmodout procedure - none */
-				   InvalidOid,	/* analyze procedure - default */
+				   F_ARRAY_TYPANALYZE,	/* array analyze procedure */
 				   new_type_oid,	/* array element type - the rowtype */
 				   true,		/* yes, this is an array type */
 				   InvalidOid,	/* this has no array type */

--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -117,29 +117,54 @@ CREATE VIEW pg_stats AS
        stawidth AS avg_width,
        stadistinct AS n_distinct,
        CASE
-            WHEN stakind1 IN (1, 4) THEN stavalues1
-            WHEN stakind2 IN (1, 4) THEN stavalues2
-            WHEN stakind3 IN (1, 4) THEN stavalues3
-            WHEN stakind4 IN (1, 4) THEN stavalues4
+            WHEN stakind1 = 1 THEN stavalues1
+            WHEN stakind2 = 1 THEN stavalues2
+            WHEN stakind3 = 1 THEN stavalues3
+            WHEN stakind4 = 1 THEN stavalues4
+            WHEN stakind5 = 1 THEN stavalues5
        END AS most_common_vals,
        CASE
-            WHEN stakind1 IN (1, 4) THEN stanumbers1
-            WHEN stakind2 IN (1, 4) THEN stanumbers2
-            WHEN stakind3 IN (1, 4) THEN stanumbers3
-            WHEN stakind4 IN (1, 4) THEN stanumbers4
+            WHEN stakind1 = 1 THEN stanumbers1
+            WHEN stakind2 = 1 THEN stanumbers2
+            WHEN stakind3 = 1 THEN stanumbers3
+            WHEN stakind4 = 1 THEN stanumbers4
+            WHEN stakind5 = 1 THEN stanumbers5
        END AS most_common_freqs,
        CASE
            WHEN stakind1 = 2 THEN stavalues1
            WHEN stakind2 = 2 THEN stavalues2
            WHEN stakind3 = 2 THEN stavalues3
            WHEN stakind4 = 2 THEN stavalues4
+            WHEN stakind5 = 2 THEN stavalues5
        END AS histogram_bounds,
        CASE
            WHEN stakind1 = 3 THEN stanumbers1[1]
            WHEN stakind2 = 3 THEN stanumbers2[1]
            WHEN stakind3 = 3 THEN stanumbers3[1]
            WHEN stakind4 = 3 THEN stanumbers4[1]
-        END AS correlation
+            WHEN stakind5 = 3 THEN stanumbers5[1]
+        END AS correlation,
+        CASE
+            WHEN stakind1 = 4 THEN stavalues1
+            WHEN stakind2 = 4 THEN stavalues2
+            WHEN stakind3 = 4 THEN stavalues3
+            WHEN stakind4 = 4 THEN stavalues4
+            WHEN stakind5 = 4 THEN stavalues5
+        END AS most_common_elems,
+        CASE
+            WHEN stakind1 = 4 THEN stanumbers1
+            WHEN stakind2 = 4 THEN stanumbers2
+            WHEN stakind3 = 4 THEN stanumbers3
+            WHEN stakind4 = 4 THEN stanumbers4
+            WHEN stakind5 = 4 THEN stanumbers5
+        END AS most_common_elem_freqs,
+        CASE
+            WHEN stakind1 = 5 THEN stanumbers1
+            WHEN stakind2 = 5 THEN stanumbers2
+            WHEN stakind3 = 5 THEN stanumbers3
+            WHEN stakind4 = 5 THEN stanumbers4
+            WHEN stakind5 = 5 THEN stanumbers5
+        END AS elem_count_histogram
    FROM pg_statistic s JOIN pg_class c ON (c.oid = s.starelid)
         JOIN pg_attribute a ON (c.oid = attrelid AND attnum = s.staattnum)
         LEFT JOIN pg_namespace n ON (n.oid = c.relnamespace)

--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -110,8 +110,6 @@ static void update_attstats(Oid relid, bool inh,
 static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
 static Datum ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);

-static bool std_typanalyze(VacAttrStats *stats);
-

 /*
 *	analyze_rel() -- analyze one relation
@@ -476,8 +474,7 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt, bool inh)
 		for (i = 0; i < attr_cnt; i++)
 		{
 			VacAttrStats *stats = vacattrstats[i];
-			AttributeOpts *aopt =
-			get_attribute_options(onerel->rd_id, stats->attr->attnum);
+			AttributeOpts *aopt;

 			stats->rows = rows;
 			stats->tupDesc = onerel->rd_att;
@@ -490,11 +487,12 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt, bool inh)
 			 * If the appropriate flavor of the n_distinct option is
 			 * specified, override with the corresponding value.
 			 */
+			aopt = get_attribute_options(onerel->rd_id, stats->attr->attnum);
 			if (aopt != NULL)
 			{
-				float8		n_distinct =
-				inh ? aopt->n_distinct_inherited : aopt->n_distinct;
+				float8		n_distinct;

+				n_distinct = inh ? aopt->n_distinct_inherited : aopt->n_distinct;
 				if (n_distinct != 0.0)
 					stats->stadistinct = n_distinct;
 			}
@@ -1794,7 +1792,7 @@ static int	compare_mcvs(const void *a, const void *b);
 /*
 * std_typanalyze -- the default type-specific typanalyze function
 */
-static bool
+bool
 std_typanalyze(VacAttrStats *stats)
 {
 	Form_pg_attribute attr = stats->attr;

--- a/src/backend/commands/typecmds.c
+++ b/src/backend/commands/typecmds.c
@@ -609,7 +609,7 @@ DefineType(List *names, List *parameters)
 			   F_ARRAY_SEND,	/* send procedure */
 			   typmodinOid,		/* typmodin procedure */
 			   typmodoutOid,	/* typmodout procedure */
-			   InvalidOid,		/* analyze procedure - default */
+			   F_ARRAY_TYPANALYZE,	/* analyze procedure */
 			   typoid,			/* element type ID */
 			   true,			/* yes this is an array type */
 			   InvalidOid,		/* no further array type */
@@ -1140,7 +1140,7 @@ DefineEnum(CreateEnumStmt *stmt)
 			   F_ARRAY_SEND,	/* send procedure */
 			   InvalidOid,		/* typmodin procedure - none */
 			   InvalidOid,		/* typmodout procedure - none */
-			   InvalidOid,		/* analyze procedure - default */
+			   F_ARRAY_TYPANALYZE,	/* analyze procedure */
 			   enumTypeOid,		/* element type ID */
 			   true,			/* yes this is an array type */
 			   InvalidOid,		/* no further array type */
@@ -1450,7 +1450,7 @@ DefineRange(CreateRangeStmt *stmt)
 			   F_ARRAY_SEND,	/* send procedure */
 			   InvalidOid,		/* typmodin procedure - none */
 			   InvalidOid,		/* typmodout procedure - none */
-			   InvalidOid,		/* analyze procedure - default */
+			   F_ARRAY_TYPANALYZE,	/* analyze procedure */
 			   typoid,			/* element type ID */
 			   true,			/* yes this is an array type */
 			   InvalidOid,		/* no further array type */

--- a/src/backend/tsearch/ts_selfuncs.c
+++ b/src/backend/tsearch/ts_selfuncs.c
@@ -220,6 +220,10 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
 	/*
 	 * There should be two more Numbers than Values, because the last two
 	 * cells are taken for minimal and maximal frequency.  Punt if not.
+	 *
+	 * (Note: the MCELEM statistics slot definition allows for a third extra
+	 * number containing the frequency of nulls, but we're not expecting that
+	 * to appear for a tsvector column.)
 	 */
 	if (nnumbers != nmcelem + 2)
 		return tsquery_opr_selec_no_stats(query);

--- a/src/backend/tsearch/ts_typanalyze.c
+++ b/src/backend/tsearch/ts_typanalyze.c
@@ -377,6 +377,11 @@ compute_tsvector_stats(VacAttrStats *stats,
 			 * able to find out the minimal and maximal frequency without
 			 * going through all the values.  We keep those two extra
 			 * frequencies in two extra cells in mcelem_freqs.
+			 *
+			 * (Note: the MCELEM statistics slot definition allows for a third
+			 * extra number containing the frequency of nulls, but we don't
+			 * create that for a tsvector column, since null elements aren't
+			 * possible.)
 			 */
 			mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum));
 			mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * sizeof(float4));

--- a/src/backend/utils/adt/Makefile
+++ b/src/backend/utils/adt/Makefile
@@ -15,7 +15,8 @@ override CFLAGS+= -mieee
 endif
 endif

-OBJS = acl.o arrayfuncs.o array_userfuncs.o arrayutils.o bool.o \
+OBJS = acl.o arrayfuncs.o array_selfuncs.o array_typanalyze.o \
+	array_userfuncs.o arrayutils.o bool.o \
 	cash.o char.o date.o datetime.o datum.o domains.o \
 	enum.o float.o format_type.o \
 	geo_ops.o geo_selfuncs.o int.o int8.o json.o like.o lockfuncs.o \

--- a/src/backend/utils/adt/array_selfuncs.c
+++ b/src/backend/utils/adt/array_selfuncs.c
--- a/src/backend/utils/adt/array_typanalyze.c
+++ b/src/backend/utils/adt/array_typanalyze.c
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -127,6 +127,7 @@
 #include "utils/syscache.h"
 #include "utils/timestamp.h"
 #include "utils/tqual.h"
+#include "utils/typcache.h"


 /* Hooks for plugins to get control when we ask for stats */
@@ -1701,27 +1702,18 @@ scalararraysel(PlannerInfo *root,
 {
 	Oid			operator = clause->opno;
 	bool		useOr = clause->useOr;
+	bool		isEquality = false;
+	bool		isInequality = false;
 	Node	   *leftop;
 	Node	   *rightop;
 	Oid			nominal_element_type;
 	Oid			nominal_element_collation;
+	TypeCacheEntry *typentry;
 	RegProcedure oprsel;
 	FmgrInfo	oprselproc;
 	Selectivity s1;

-	/*
-	 * First, look up the underlying operator's selectivity estimator. Punt if
-	 * it hasn't got one.
-	 */
-	if (is_join_clause)
-		oprsel = get_oprjoin(operator);
-	else
-		oprsel = get_oprrest(operator);
-	if (!oprsel)
-		return (Selectivity) 0.5;
-	fmgr_info(oprsel, &oprselproc);
-
-	/* deconstruct the expression */
+	/* First, deconstruct the expression */
 	Assert(list_length(clause->args) == 2);
 	leftop = (Node *) linitial(clause->args);
 	rightop = (Node *) lsecond(clause->args);
@@ -1736,6 +1728,46 @@ scalararraysel(PlannerInfo *root,
 	/* look through any binary-compatible relabeling of rightop */
 	rightop = strip_array_coercion(rightop);

+	/*
+	 * Detect whether the operator is the default equality or inequality
+	 * operator of the array element type.
+	 */
+	typentry = lookup_type_cache(nominal_element_type, TYPECACHE_EQ_OPR);
+	if (OidIsValid(typentry->eq_opr))
+	{
+		if (operator == typentry->eq_opr)
+			isEquality = true;
+		else if (get_negator(operator) == typentry->eq_opr)
+			isInequality = true;
+	}
+
+	/*
+	 * If it is equality or inequality, we might be able to estimate this as
+	 * a form of array containment; for instance "const = ANY(column)" can be
+	 * treated as "ARRAY[const] <@ column".  scalararraysel_containment tries
+	 * that, and returns the selectivity estimate if successful, or -1 if not.
+	 */
+	if ((isEquality || isInequality) && !is_join_clause)
+	{
+		s1 = scalararraysel_containment(root, leftop, rightop,
+										nominal_element_type,
+										isEquality, useOr, varRelid);
+		if (s1 >= 0.0)
+			return s1;
+	}
+
+	/*
+	 * Look up the underlying operator's selectivity estimator. Punt if it
+	 * hasn't got one.
+	 */
+	if (is_join_clause)
+		oprsel = get_oprjoin(operator);
+	else
+		oprsel = get_oprrest(operator);
+	if (!oprsel)
+		return (Selectivity) 0.5;
+	fmgr_info(oprsel, &oprselproc);
+
 	/*
 	 * We consider three cases:
 	 *

--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
 */

 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	201203021
+#define CATALOG_VERSION_NO	201203031

 #endif
--- a/src/include/catalog/pg_operator.h
+++ b/src/include/catalog/pg_operator.h
@@ -1520,12 +1520,15 @@ DATA(insert OID = 2590 (  "|&>"    PGNSP PGUID b f f 718 718	16	 0	 0 circle_ove
 DESCR("overlaps or is above");

 /* overlap/contains/contained for arrays */
-DATA(insert OID = 2750 (  "&&"	   PGNSP PGUID b f f 2277 2277	16 2750  0 arrayoverlap areasel areajoinsel ));
+DATA(insert OID = 2750 (  "&&"	   PGNSP PGUID b f f 2277 2277	16 2750  0 arrayoverlap arraycontsel arraycontjoinsel ));
 DESCR("overlaps");
-DATA(insert OID = 2751 (  "@>"	   PGNSP PGUID b f f 2277 2277	16 2752  0 arraycontains contsel contjoinsel ));
+#define OID_ARRAY_OVERLAP_OP	2750
+DATA(insert OID = 2751 (  "@>"	   PGNSP PGUID b f f 2277 2277	16 2752  0 arraycontains arraycontsel arraycontjoinsel ));
 DESCR("contains");
-DATA(insert OID = 2752 (  "<@"	   PGNSP PGUID b f f 2277 2277	16 2751  0 arraycontained contsel contjoinsel ));
+#define OID_ARRAY_CONTAINS_OP	2751
+DATA(insert OID = 2752 (  "<@"	   PGNSP PGUID b f f 2277 2277	16 2751  0 arraycontained arraycontsel arraycontjoinsel ));
 DESCR("is contained by");
+#define OID_ARRAY_CONTAINED_OP	2752

 /* capturing operators to preserve pre-8.3 behavior of text concatenation */
 DATA(insert OID = 2779 (  "||"	   PGNSP PGUID b f f 25 2776	25	 0 0 textanycat - - ));

--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -869,6 +869,12 @@ DATA(insert OID = 2334 (  array_agg_finalfn   PGNSP PGUID 12 1 0 0 0 f f f f f f
 DESCR("aggregate final function");
 DATA(insert OID = 2335 (  array_agg		   PGNSP PGUID 12 1 0 0 0 t f f f f f i 1 0 2277 "2283" _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ ));
 DESCR("concatenate aggregate input into an array");
+DATA(insert OID = 3816 (  array_typanalyze PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 16 "2281" _null_ _null_ _null_ _null_ array_typanalyze _null_ _null_ _null_ ));
+DESCR("array typanalyze");
+DATA(insert OID = 3817 (  arraycontsel	   PGNSP PGUID 12 1 0 0 0 f f f f t f s 4 0 701 "2281 26 2281 23" _null_ _null_ _null_ _null_ arraycontsel _null_ _null_ _null_ ));
+DESCR("restriction selectivity for array-containment operators");
+DATA(insert OID = 3818 (  arraycontjoinsel PGNSP PGUID 12 1 0 0 0 f f f f t f s 5 0 701 "2281 26 2281 21 2281" _null_ _null_ _null_ _null_ arraycontjoinsel _null_ _null_ _null_ ));
+DESCR("join selectivity for array-containment operators");

 DATA(insert OID = 760 (  smgrin			   PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 210 "2275" _null_ _null_ _null_ _null_	smgrin _null_ _null_ _null_ ));
 DESCR("I/O");

--- a/src/include/catalog/pg_statistic.h
+++ b/src/include/catalog/pg_statistic.h
@@ -21,16 +21,6 @@

 #include "catalog/genbki.h"

-/*
- * The CATALOG definition has to refer to the type of stavaluesN as
- * "anyarray" so that bootstrap mode recognizes it.  There is no real
- * typedef for that, however.  Since the fields are potentially-null and
- * therefore can't be accessed directly from C code, there is no particular
- * need for the C struct definition to show a valid field type --- instead
- * we just make it int.
- */
-#define anyarray int
-
 /* ----------------
 *		pg_statistic definition.  cpp turns this into
 *		typedef struct FormData_pg_statistic
@@ -83,7 +73,7 @@ CATALOG(pg_statistic,2619) BKI_WITHOUT_OIDS
 	 * we do not hard-wire any particular meaning for the remaining
 	 * statistical fields.	Instead, we provide several "slots" in which
 	 * statistical data can be placed.	Each slot includes:
-	 *		kind			integer code identifying kind of data
+	 *		kind			integer code identifying kind of data (see below)
 	 *		op				OID of associated operator, if needed
 	 *		numbers			float4 array (for statistical values)
 	 *		values			anyarray (for representations of data values)
@@ -98,40 +88,36 @@ CATALOG(pg_statistic,2619) BKI_WITHOUT_OIDS
 	int2		stakind2;
 	int2		stakind3;
 	int2		stakind4;
+	int2		stakind5;

 	Oid			staop1;
 	Oid			staop2;
 	Oid			staop3;
 	Oid			staop4;
+	Oid			staop5;

-	/*
-	 * THE REST OF THESE ARE VARIABLE LENGTH FIELDS, and may even be absent
-	 * (NULL). They cannot be accessed as C struct entries; you have to use
-	 * the full field access machinery (heap_getattr) for them.  We declare
-	 * them here for the catalog machinery.
-	 */
-
+#ifdef CATALOG_VARLEN			/* variable-length fields start here */
 	float4		stanumbers1[1];
 	float4		stanumbers2[1];
 	float4		stanumbers3[1];
 	float4		stanumbers4[1];
+	float4		stanumbers5[1];

-#ifdef CATALOG_VARLEN			/* variable-length fields start here */
 	/*
-	 * Values in these arrays are values of the column's data type.  We
-	 * presently have to cheat quite a bit to allow polymorphic arrays of this
-	 * kind, but perhaps someday it'll be a less bogus facility.
+	 * Values in these arrays are values of the column's data type, or of some
+	 * related type such as an array element type.  We presently have to cheat
+	 * quite a bit to allow polymorphic arrays of this kind, but perhaps
+	 * someday it'll be a less bogus facility.
 	 */
 	anyarray	stavalues1;
 	anyarray	stavalues2;
 	anyarray	stavalues3;
 	anyarray	stavalues4;
+	anyarray	stavalues5;
 #endif
 } FormData_pg_statistic;

-#define STATISTIC_NUM_SLOTS  4
-
-#undef anyarray
+#define STATISTIC_NUM_SLOTS  5


 /* ----------------
@@ -145,7 +131,7 @@ typedef FormData_pg_statistic *Form_pg_statistic;
 *		compiler constants for pg_statistic
 * ----------------
 */
-#define Natts_pg_statistic				22
+#define Natts_pg_statistic				26
 #define Anum_pg_statistic_starelid		1
 #define Anum_pg_statistic_staattnum		2
 #define Anum_pg_statistic_stainherit	3
@@ -156,22 +142,26 @@ typedef FormData_pg_statistic *Form_pg_statistic;
 #define Anum_pg_statistic_stakind2		8
 #define Anum_pg_statistic_stakind3		9
 #define Anum_pg_statistic_stakind4		10
-#define Anum_pg_statistic_staop1		11
-#define Anum_pg_statistic_staop2		12
-#define Anum_pg_statistic_staop3		13
-#define Anum_pg_statistic_staop4		14
-#define Anum_pg_statistic_stanumbers1	15
-#define Anum_pg_statistic_stanumbers2	16
-#define Anum_pg_statistic_stanumbers3	17
-#define Anum_pg_statistic_stanumbers4	18
-#define Anum_pg_statistic_stavalues1	19
-#define Anum_pg_statistic_stavalues2	20
-#define Anum_pg_statistic_stavalues3	21
-#define Anum_pg_statistic_stavalues4	22
+#define Anum_pg_statistic_stakind5		11
+#define Anum_pg_statistic_staop1		12
+#define Anum_pg_statistic_staop2		13
+#define Anum_pg_statistic_staop3		14
+#define Anum_pg_statistic_staop4		15
+#define Anum_pg_statistic_staop5		16
+#define Anum_pg_statistic_stanumbers1	17
+#define Anum_pg_statistic_stanumbers2	18
+#define Anum_pg_statistic_stanumbers3	19
+#define Anum_pg_statistic_stanumbers4	20
+#define Anum_pg_statistic_stanumbers5	21
+#define Anum_pg_statistic_stavalues1	22
+#define Anum_pg_statistic_stavalues2	23
+#define Anum_pg_statistic_stavalues3	24
+#define Anum_pg_statistic_stavalues4	25
+#define Anum_pg_statistic_stavalues5	26

 /*
- * Currently, three statistical slot "kinds" are defined: most common values,
- * histogram, and correlation.	Additional "kinds" will probably appear in
+ * Currently, five statistical slot "kinds" are defined by core PostgreSQL,
+ * as documented below.  Additional "kinds" will probably appear in
 * future to help cope with non-scalar datatypes.  Also, custom data types
 * can define their own "kind" codes by mutual agreement between a custom
 * typanalyze routine and the selectivity estimation functions of the type's
@@ -250,11 +240,14 @@ typedef FormData_pg_statistic *Form_pg_statistic;
 * the most common element values, and stanumbers their frequencies.  Unlike
 * MCV slots, frequencies are measured as the fraction of non-null rows the
 * element value appears in, not the frequency of all rows.  Also unlike
- * MCV slots, the values are sorted into order (to support binary search
- * for a particular value).  Since this puts the minimum and maximum
- * frequencies at unpredictable spots in stanumbers, there are two extra
- * members of stanumbers, holding copies of the minimum and maximum
- * frequencies.
+ * MCV slots, the values are sorted into the element type's default order
+ * (to support binary search for a particular value).  Since this puts the
+ * minimum and maximum frequencies at unpredictable spots in stanumbers,
+ * there are two extra members of stanumbers, holding copies of the minimum
+ * and maximum frequencies.  Optionally, there can be a third extra member,
+ * which holds the frequency of null elements (expressed in the same terms:
+ * the fraction of non-null rows that contain at least one null element).  If
+ * this member is omitted, the column is presumed to contain no null elements.
 *
 * Note: in current usage for tsvector columns, the stavalues elements are of
 * type text, even though their representation within tsvector is not
@@ -262,4 +255,17 @@ typedef FormData_pg_statistic *Form_pg_statistic;
 */
 #define STATISTIC_KIND_MCELEM  4

+/*
+ * A "distinct elements count histogram" slot describes the distribution of
+ * the number of distinct element values present in each row of an array-type
+ * column.  Only non-null rows are considered, and only non-null elements.
+ * staop contains the equality operator appropriate to the element type.
+ * stavalues is not used and should be NULL.  The last member of stanumbers is
+ * the average count of distinct element values over all non-null rows.  The
+ * preceding M (>=2) members form a histogram that divides the population of
+ * distinct-elements counts into M-1 bins of approximately equal population.
+ * The first of these is the minimum observed count, and the last the maximum.
+ */
+#define STATISTIC_KIND_DECHIST  5
+
 #endif   /* PG_STATISTIC_H */
--- a/src/include/catalog/pg_type.h
+++ b/src/include/catalog/pg_type.h
--- a/src/include/commands/vacuum.h
+++ b/src/include/commands/vacuum.h
@@ -61,6 +61,11 @@ typedef struct VacAttrStats *VacAttrStatsP;
 typedef Datum (*AnalyzeAttrFetchFunc) (VacAttrStatsP stats, int rownum,
 												   bool *isNull);

+typedef void (*AnalyzeAttrComputeStatsFunc) (VacAttrStatsP stats,
+											 AnalyzeAttrFetchFunc fetchfunc,
+											 int samplerows,
+											 double totalrows);
+
 typedef struct VacAttrStats
 {
 	/*
@@ -83,10 +88,7 @@ typedef struct VacAttrStats
 	 * These fields must be filled in by the typanalyze routine, unless it
 	 * returns FALSE.
 	 */
-	void		(*compute_stats) (VacAttrStatsP stats,
-											  AnalyzeAttrFetchFunc fetchfunc,
-											  int samplerows,
-											  double totalrows);
+	AnalyzeAttrComputeStatsFunc compute_stats;	/* function pointer */
 	int			minrows;		/* Minimum # of rows wanted for stats */
 	void	   *extra_data;		/* for extra type-specific data */

@@ -167,5 +169,6 @@ extern void lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
 /* in commands/analyze.c */
 extern void analyze_rel(Oid relid, VacuumStmt *vacstmt,
 			BufferAccessStrategy bstrategy);
+extern bool std_typanalyze(VacAttrStats *stats);

 #endif   /* VACUUM_H */
--- a/src/include/utils/array.h
+++ b/src/include/utils/array.h
@@ -289,4 +289,9 @@ extern ArrayType *create_singleton_array(FunctionCallInfo fcinfo,
 extern Datum array_agg_transfn(PG_FUNCTION_ARGS);
 extern Datum array_agg_finalfn(PG_FUNCTION_ARGS);

+/*
+ * prototypes for functions defined in array_typanalyze.c
+ */
+extern Datum array_typanalyze(PG_FUNCTION_ARGS);
+
 #endif   /* ARRAY_H */
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -95,9 +95,6 @@ typedef enum
 	Pattern_Prefix_None, Pattern_Prefix_Partial, Pattern_Prefix_Exact
 } Pattern_Prefix_Status;

-
-/* selfuncs.c */
-
 /* Hooks for plugins to get control when we ask for stats */
 typedef bool (*get_relation_stats_hook_type) (PlannerInfo *root,
 														  RangeTblEntry *rte,
@@ -110,6 +107,8 @@ typedef bool (*get_index_stats_hook_type) (PlannerInfo *root,
 												  VariableStatData *vardata);
 extern PGDLLIMPORT get_index_stats_hook_type get_index_stats_hook;

+/* Functions in selfuncs.c */
+
 extern void examine_variable(PlannerInfo *root, Node *node, int varRelid,
 				 VariableStatData *vardata);
 extern bool get_restriction_variable(PlannerInfo *root, List *args,
@@ -197,4 +196,13 @@ extern Datum gistcostestimate(PG_FUNCTION_ARGS);
 extern Datum spgcostestimate(PG_FUNCTION_ARGS);
 extern Datum gincostestimate(PG_FUNCTION_ARGS);

+/* Functions in array_selfuncs.c */
+
+extern Selectivity scalararraysel_containment(PlannerInfo *root,
+						   Node *leftop, Node *rightop,
+						   Oid elemtype, bool isEquality, bool useOr,
+						   int varRelid);
+extern Datum arraycontsel(PG_FUNCTION_ARGS);
+extern Datum arraycontjoinsel(PG_FUNCTION_ARGS);
+
 #endif   /* SELFUNCS_H */
--- a/src/test/regress/expected/arrays.out
+++ b/src/test/regress/expected/arrays.out
@@ -421,6 +421,7 @@ SELECT 0 || ARRAY[1,2] || 3 AS "{0,1,2,3}";
 {0,1,2,3}
 (1 row)

+ANALYZE array_op_test;
 SELECT * FROM array_op_test WHERE i @> '{32}' ORDER BY seqno;
 seqno |                i                |                                                                 t                                                                  
 -------+---------------------------------+------------------------------------------------------------------------------------------------------------------------------------

--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1317,7 +1317,7 @@ SELECT viewname, definition FROM pg_views WHERE schemaname <> 'information_schem
 pg_statio_user_indexes          | SELECT pg_statio_all_indexes.relid, pg_statio_all_indexes.indexrelid, pg_statio_all_indexes.schemaname, pg_statio_all_indexes.relname, pg_statio_all_indexes.indexrelname, pg_statio_all_indexes.idx_blks_read, pg_statio_all_indexes.idx_blks_hit FROM pg_statio_all_indexes WHERE ((pg_statio_all_indexes.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_statio_all_indexes.schemaname !~ '^pg_toast'::text));
 pg_statio_user_sequences        | SELECT pg_statio_all_sequences.relid, pg_statio_all_sequences.schemaname, pg_statio_all_sequences.relname, pg_statio_all_sequences.blks_read, pg_statio_all_sequences.blks_hit FROM pg_statio_all_sequences WHERE ((pg_statio_all_sequences.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_statio_all_sequences.schemaname !~ '^pg_toast'::text));
 pg_statio_user_tables           | SELECT pg_statio_all_tables.relid, pg_statio_all_tables.schemaname, pg_statio_all_tables.relname, pg_statio_all_tables.heap_blks_read, pg_statio_all_tables.heap_blks_hit, pg_statio_all_tables.idx_blks_read, pg_statio_all_tables.idx_blks_hit, pg_statio_all_tables.toast_blks_read, pg_statio_all_tables.toast_blks_hit, pg_statio_all_tables.tidx_blks_read, pg_statio_all_tables.tidx_blks_hit FROM pg_statio_all_tables WHERE ((pg_statio_all_tables.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_statio_all_tables.schemaname !~ '^pg_toast'::text));
- pg_stats                        | SELECT n.nspname AS schemaname, c.relname AS tablename, a.attname, s.stainherit AS inherited, s.stanullfrac AS null_frac, s.stawidth AS avg_width, s.stadistinct AS n_distinct, CASE WHEN (s.stakind1 = ANY (ARRAY[1, 4])) THEN s.stavalues1 WHEN (s.stakind2 = ANY (ARRAY[1, 4])) THEN s.stavalues2 WHEN (s.stakind3 = ANY (ARRAY[1, 4])) THEN s.stavalues3 WHEN (s.stakind4 = ANY (ARRAY[1, 4])) THEN s.stavalues4 ELSE NULL::anyarray END AS most_common_vals, CASE WHEN (s.stakind1 = ANY (ARRAY[1, 4])) THEN s.stanumbers1 WHEN (s.stakind2 = ANY (ARRAY[1, 4])) THEN s.stanumbers2 WHEN (s.stakind3 = ANY (ARRAY[1, 4])) THEN s.stanumbers3 WHEN (s.stakind4 = ANY (ARRAY[1, 4])) THEN s.stanumbers4 ELSE NULL::real[] END AS most_common_freqs, CASE WHEN (s.stakind1 = 2) THEN s.stavalues1 WHEN (s.stakind2 = 2) THEN s.stavalues2 WHEN (s.stakind3 = 2) THEN s.stavalues3 WHEN (s.stakind4 = 2) THEN s.stavalues4 ELSE NULL::anyarray END AS histogram_bounds, CASE WHEN (s.stakind1 = 3) THEN s.stanumbers1[1] WHEN (s.stakind2 = 3) THEN s.stanumbers2[1] WHEN (s.stakind3 = 3) THEN s.stanumbers3[1] WHEN (s.stakind4 = 3) THEN s.stanumbers4[1] ELSE NULL::real END AS correlation FROM (((pg_statistic s JOIN pg_class c ON ((c.oid = s.starelid))) JOIN pg_attribute a ON (((c.oid = a.attrelid) AND (a.attnum = s.staattnum)))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE ((NOT a.attisdropped) AND has_column_privilege(c.oid, a.attnum, 'select'::text));
+ pg_stats                        | SELECT n.nspname AS schemaname, c.relname AS tablename, a.attname, s.stainherit AS inherited, s.stanullfrac AS null_frac, s.stawidth AS avg_width, s.stadistinct AS n_distinct, CASE WHEN (s.stakind1 = 1) THEN s.stavalues1 WHEN (s.stakind2 = 1) THEN s.stavalues2 WHEN (s.stakind3 = 1) THEN s.stavalues3 WHEN (s.stakind4 = 1) THEN s.stavalues4 WHEN (s.stakind5 = 1) THEN s.stavalues5 ELSE NULL::anyarray END AS most_common_vals, CASE WHEN (s.stakind1 = 1) THEN s.stanumbers1 WHEN (s.stakind2 = 1) THEN s.stanumbers2 WHEN (s.stakind3 = 1) THEN s.stanumbers3 WHEN (s.stakind4 = 1) THEN s.stanumbers4 WHEN (s.stakind5 = 1) THEN s.stanumbers5 ELSE NULL::real[] END AS most_common_freqs, CASE WHEN (s.stakind1 = 2) THEN s.stavalues1 WHEN (s.stakind2 = 2) THEN s.stavalues2 WHEN (s.stakind3 = 2) THEN s.stavalues3 WHEN (s.stakind4 = 2) THEN s.stavalues4 WHEN (s.stakind5 = 2) THEN s.stavalues5 ELSE NULL::anyarray END AS histogram_bounds, CASE WHEN (s.stakind1 = 3) THEN s.stanumbers1[1] WHEN (s.stakind2 = 3) THEN s.stanumbers2[1] WHEN (s.stakind3 = 3) THEN s.stanumbers3[1] WHEN (s.stakind4 = 3) THEN s.stanumbers4[1] WHEN (s.stakind5 = 3) THEN s.stanumbers5[1] ELSE NULL::real END AS correlation, CASE WHEN (s.stakind1 = 4) THEN s.stavalues1 WHEN (s.stakind2 = 4) THEN s.stavalues2 WHEN (s.stakind3 = 4) THEN s.stavalues3 WHEN (s.stakind4 = 4) THEN s.stavalues4 WHEN (s.stakind5 = 4) THEN s.stavalues5 ELSE NULL::anyarray END AS most_common_elems, CASE WHEN (s.stakind1 = 4) THEN s.stanumbers1 WHEN (s.stakind2 = 4) THEN s.stanumbers2 WHEN (s.stakind3 = 4) THEN s.stanumbers3 WHEN (s.stakind4 = 4) THEN s.stanumbers4 WHEN (s.stakind5 = 4) THEN s.stanumbers5 ELSE NULL::real[] END AS most_common_elem_freqs, CASE WHEN (s.stakind1 = 5) THEN s.stanumbers1 WHEN (s.stakind2 = 5) THEN s.stanumbers2 WHEN (s.stakind3 = 5) THEN s.stanumbers3 WHEN (s.stakind4 = 5) THEN s.stanumbers4 WHEN (s.stakind5 = 5) THEN s.stanumbers5 ELSE NULL::real[] END AS elem_count_histogram FROM (((pg_statistic s JOIN pg_class c ON ((c.oid = s.starelid))) JOIN pg_attribute a ON (((c.oid = a.attrelid) AND (a.attnum = s.staattnum)))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE ((NOT a.attisdropped) AND has_column_privilege(c.oid, a.attnum, 'select'::text));
 pg_tables                       | SELECT n.nspname AS schemaname, c.relname AS tablename, pg_get_userbyid(c.relowner) AS tableowner, t.spcname AS tablespace, c.relhasindex AS hasindexes, c.relhasrules AS hasrules, c.relhastriggers AS hastriggers FROM ((pg_class c LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) LEFT JOIN pg_tablespace t ON ((t.oid = c.reltablespace))) WHERE (c.relkind = 'r'::"char");
 pg_timezone_abbrevs             | SELECT pg_timezone_abbrevs.abbrev, pg_timezone_abbrevs.utc_offset, pg_timezone_abbrevs.is_dst FROM pg_timezone_abbrevs() pg_timezone_abbrevs(abbrev, utc_offset, is_dst);
 pg_timezone_names               | SELECT pg_timezone_names.name, pg_timezone_names.abbrev, pg_timezone_names.utc_offset, pg_timezone_names.is_dst FROM pg_timezone_names() pg_timezone_names(name, abbrev, utc_offset, is_dst);

--- a/src/test/regress/expected/type_sanity.out
+++ b/src/test/regress/expected/type_sanity.out
@@ -375,6 +375,39 @@ WHERE p1.typanalyze = p2.oid AND NOT
 -----+---------+-----+---------
 (0 rows)

+-- domains inherit their base type's typanalyze
+SELECT d.oid, d.typname, d.typanalyze, t.oid, t.typname, t.typanalyze
+FROM pg_type d JOIN pg_type t ON d.typbasetype = t.oid
+WHERE d.typanalyze != t.typanalyze;
+ oid | typname | typanalyze | oid | typname | typanalyze 
+-----+---------+------------+-----+---------+------------
+(0 rows)
+
+-- range_typanalyze should be used for all and only range types
+-- (but exclude domains, which we checked above)
+SELECT t.oid, t.typname, t.typanalyze
+FROM pg_type t LEFT JOIN pg_range r on t.oid = r.rngtypid
+WHERE t.typbasetype = 0 AND
+    (t.typanalyze = 'range_typanalyze'::regproc) != (r.rngtypid IS NOT NULL);
+ oid | typname | typanalyze 
+-----+---------+------------
+(0 rows)
+
+-- array_typanalyze should be used for all and only array types
+-- (but exclude domains, which we checked above)
+-- As of 9.2 this finds int2vector and oidvector, which are weird anyway
+SELECT t.oid, t.typname, t.typanalyze
+FROM pg_type t
+WHERE t.typbasetype = 0 AND
+    (t.typanalyze = 'array_typanalyze'::regproc) !=
+    (typelem != 0 AND typlen < 0)
+ORDER BY 1;
+ oid |  typname   | typanalyze 
+-----+------------+------------
+  22 | int2vector | -
+  30 | oidvector  | -
+(2 rows)
+
 -- **************** pg_class ****************
 -- Look for illegal values in pg_class fields
 SELECT p1.oid, p1.relname

--- a/src/test/regress/sql/arrays.sql
+++ b/src/test/regress/sql/arrays.sql
@@ -196,6 +196,8 @@ SELECT ARRAY[[1,2],[3,4]] || ARRAY[5,6] AS "{{1,2},{3,4},{5,6}}";
 SELECT ARRAY[0,0] || ARRAY[1,1] || ARRAY[2,2] AS "{0,0,1,1,2,2}";
 SELECT 0 || ARRAY[1,2] || 3 AS "{0,1,2,3}";

+ANALYZE array_op_test;
+
 SELECT * FROM array_op_test WHERE i @> '{32}' ORDER BY seqno;
 SELECT * FROM array_op_test WHERE i && '{32}' ORDER BY seqno;
 SELECT * FROM array_op_test WHERE i @> '{17}' ORDER BY seqno;

--- a/src/test/regress/sql/type_sanity.sql
+++ b/src/test/regress/sql/type_sanity.sql
@@ -272,6 +272,31 @@ WHERE p1.typanalyze = p2.oid AND NOT
     p2.proargtypes[0] = 'internal'::regtype AND
     p2.prorettype = 'bool'::regtype AND NOT p2.proretset);

+-- domains inherit their base type's typanalyze
+
+SELECT d.oid, d.typname, d.typanalyze, t.oid, t.typname, t.typanalyze
+FROM pg_type d JOIN pg_type t ON d.typbasetype = t.oid
+WHERE d.typanalyze != t.typanalyze;
+
+-- range_typanalyze should be used for all and only range types
+-- (but exclude domains, which we checked above)
+
+SELECT t.oid, t.typname, t.typanalyze
+FROM pg_type t LEFT JOIN pg_range r on t.oid = r.rngtypid
+WHERE t.typbasetype = 0 AND
+    (t.typanalyze = 'range_typanalyze'::regproc) != (r.rngtypid IS NOT NULL);
+
+-- array_typanalyze should be used for all and only array types
+-- (but exclude domains, which we checked above)
+-- As of 9.2 this finds int2vector and oidvector, which are weird anyway
+
+SELECT t.oid, t.typname, t.typanalyze
+FROM pg_type t
+WHERE t.typbasetype = 0 AND
+    (t.typanalyze = 'array_typanalyze'::regproc) !=
+    (typelem != 0 AND typlen < 0)
+ORDER BY 1;
+
 -- **************** pg_class ****************

 -- Look for illegal values in pg_class fields