Make planner compute the number of hash buckets the same way that
nodeHash.c will compute it (by sharing code).

Make planner compute the number of hash buckets the same way that
nodeHash.c will compute it (by sharing code).
01a819ab · Tom Lane · ccda1a67 · 01a819ab · 01a819ab · 01a819ab
3 changed files
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -7,7 +7,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
- *	$Id: nodeHash.c,v 1.57 2001/05/27 20:42:18 tgl Exp $
+ *	$Id: nodeHash.c,v 1.58 2001/06/11 00:17:07 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -16,14 +16,12 @@
 *		ExecHash		- generate an in-memory hash table of the relation
 *		ExecInitHash	- initialize node and subnodes
 *		ExecEndHash		- shutdown node and subnodes
- *
 */
+#include "postgres.h"

 #include <sys/types.h>
 #include <math.h>

-#include "postgres.h"
-
 #include "executor/execdebug.h"
 #include "executor/nodeHash.h"
 #include "executor/nodeHashjoin.h"
@@ -209,111 +207,27 @@ ExecEndHash(Hash *node)
 *		create a hashtable in shared memory for hashjoin.
 * ----------------------------------------------------------------
 */
-#define FUDGE_FAC				2.0
-
 HashJoinTable
 ExecHashTableCreate(Hash *node)
 {
-	Plan	   *outerNode;
-	double		ntuples;
-	int			tupsize;
-	double		inner_rel_bytes;
-	double		hash_table_bytes;
-	int			nbatch;
 	HashJoinTable hashtable;
-	int			nbuckets;
+	Plan	   *outerNode;
 	int			totalbuckets;
-	int			bucketsize;
+	int			nbuckets;
+	int			nbatch;
 	int			i;
 	MemoryContext oldcxt;

 	/*
 	 * Get information about the size of the relation to be hashed (it's
 	 * the "outer" subtree of this node, but the inner relation of the
-	 * hashjoin).
-	 *
-	 * Caution: this is only the planner's estimates, and so can't be trusted
-	 * too far.  Apply a healthy fudge factor.
+	 * hashjoin).  Compute the appropriate size of the hash table.
 	 */
 	outerNode = outerPlan(node);
-	ntuples = outerNode->plan_rows;
-	if (ntuples <= 0.0)			/* force a plausible size if no info */
-		ntuples = 1000.0;
-
-	/*
-	 * estimate tupsize based on footprint of tuple in hashtable... but
-	 * what about palloc overhead?
-	 */
-	tupsize = MAXALIGN(outerNode->plan_width) +
-		MAXALIGN(sizeof(HashJoinTupleData));
-	inner_rel_bytes = ntuples * tupsize * FUDGE_FAC;
-
-	/*
-	 * Target hashtable size is SortMem kilobytes, but not less than
-	 * sqrt(estimated inner rel size), so as to avoid horrible
-	 * performance.
-	 */
-	hash_table_bytes = sqrt(inner_rel_bytes);
-	if (hash_table_bytes < (SortMem * 1024L))
-		hash_table_bytes = SortMem * 1024L;
-
-	/*
-	 * Count the number of hash buckets we want for the whole relation,
-	 * for an average bucket load of NTUP_PER_BUCKET (per virtual
-	 * bucket!).
-	 */
-	totalbuckets = (int) ceil(ntuples * FUDGE_FAC / NTUP_PER_BUCKET);
-
-	/*
-	 * Count the number of buckets we think will actually fit in the
-	 * target memory size, at a loading of NTUP_PER_BUCKET (physical
-	 * buckets). NOTE: FUDGE_FAC here determines the fraction of the
-	 * hashtable space reserved to allow for nonuniform distribution of
-	 * hash values. Perhaps this should be a different number from the
-	 * other uses of FUDGE_FAC, but since we have no real good way to pick
-	 * either one...
-	 */
-	bucketsize = NTUP_PER_BUCKET * tupsize;
-	nbuckets = (int) (hash_table_bytes / (bucketsize * FUDGE_FAC));
-	if (nbuckets <= 0)
-		nbuckets = 1;

-	if (totalbuckets <= nbuckets)
-	{
+	ExecChooseHashTableSize(outerNode->plan_rows, outerNode->plan_width,
+							&totalbuckets, &nbuckets, &nbatch);

-		/*
-		 * We have enough space, so no batching.  In theory we could even
-		 * reduce nbuckets, but since that could lead to poor behavior if
-		 * estimated ntuples is much less than reality, it seems better to
-		 * make more buckets instead of fewer.
-		 */
-		totalbuckets = nbuckets;
-		nbatch = 0;
-	}
-	else
-	{
-
-		/*
-		 * Need to batch; compute how many batches we want to use. Note
-		 * that nbatch doesn't have to have anything to do with the ratio
-		 * totalbuckets/nbuckets; in fact, it is the number of groups we
-		 * will use for the part of the data that doesn't fall into the
-		 * first nbuckets hash buckets.
-		 */
-		nbatch = (int) ceil((inner_rel_bytes - hash_table_bytes) /
-							hash_table_bytes);
-		if (nbatch <= 0)
-			nbatch = 1;
-	}
-
-	/*
-	 * Now, totalbuckets is the number of (virtual) hashbuckets for the
-	 * whole relation, and nbuckets is the number of physical hashbuckets
-	 * we will use in the first pass.  Data falling into the first
-	 * nbuckets virtual hashbuckets gets handled in the first pass;
-	 * everything else gets divided into nbatch batches to be processed in
-	 * additional passes.
-	 */
 #ifdef HJDEBUG
 	printf("nbatch = %d, totalbuckets = %d, nbuckets = %d\n",
 		   nbatch, totalbuckets, nbuckets);
@@ -407,6 +321,117 @@ ExecHashTableCreate(Hash *node)
 	return hashtable;
 }

+
+/*
+ * Compute appropriate size for hashtable given the estimated size of the
+ * relation to be hashed (number of rows and average row width).
+ *
+ * Caution: the input is only the planner's estimates, and so can't be
+ * trusted too far.  Apply a healthy fudge factor.
+ *
+ * This is exported so that the planner's costsize.c can use it.
+ */
+
+/* Target bucket loading (tuples per bucket) */
+#define NTUP_PER_BUCKET			10
+/* Fudge factor to allow for inaccuracy of input estimates */
+#define FUDGE_FAC				2.0
+
+void
+ExecChooseHashTableSize(double ntuples, int tupwidth,
+						int *virtualbuckets,
+						int *physicalbuckets,
+						int *numbatches)
+{
+	int			tupsize;
+	double		inner_rel_bytes;
+	double		hash_table_bytes;
+	int			nbatch;
+	int			nbuckets;
+	int			totalbuckets;
+	int			bucketsize;
+
+	/* Force a plausible relation size if no info */
+	if (ntuples <= 0.0)
+		ntuples = 1000.0;
+
+	/*
+	 * Estimate tupsize based on footprint of tuple in hashtable... but
+	 * what about palloc overhead?
+	 */
+	tupsize = MAXALIGN(tupwidth) + MAXALIGN(sizeof(HashJoinTupleData));
+	inner_rel_bytes = ntuples * tupsize * FUDGE_FAC;
+
+	/*
+	 * Target hashtable size is SortMem kilobytes, but not less than
+	 * sqrt(estimated inner rel size), so as to avoid horrible
+	 * performance.
+	 */
+	hash_table_bytes = sqrt(inner_rel_bytes);
+	if (hash_table_bytes < (SortMem * 1024L))
+		hash_table_bytes = SortMem * 1024L;
+
+	/*
+	 * Count the number of hash buckets we want for the whole relation,
+	 * for an average bucket load of NTUP_PER_BUCKET (per virtual
+	 * bucket!).
+	 */
+	totalbuckets = (int) ceil(ntuples * FUDGE_FAC / NTUP_PER_BUCKET);
+
+	/*
+	 * Count the number of buckets we think will actually fit in the
+	 * target memory size, at a loading of NTUP_PER_BUCKET (physical
+	 * buckets). NOTE: FUDGE_FAC here determines the fraction of the
+	 * hashtable space reserved to allow for nonuniform distribution of
+	 * hash values. Perhaps this should be a different number from the
+	 * other uses of FUDGE_FAC, but since we have no real good way to pick
+	 * either one...
+	 */
+	bucketsize = NTUP_PER_BUCKET * tupsize;
+	nbuckets = (int) (hash_table_bytes / (bucketsize * FUDGE_FAC));
+	if (nbuckets <= 0)
+		nbuckets = 1;
+
+	if (totalbuckets <= nbuckets)
+	{
+		/*
+		 * We have enough space, so no batching.  In theory we could even
+		 * reduce nbuckets, but since that could lead to poor behavior if
+		 * estimated ntuples is much less than reality, it seems better to
+		 * make more buckets instead of fewer.
+		 */
+		totalbuckets = nbuckets;
+		nbatch = 0;
+	}
+	else
+	{
+		/*
+		 * Need to batch; compute how many batches we want to use. Note
+		 * that nbatch doesn't have to have anything to do with the ratio
+		 * totalbuckets/nbuckets; in fact, it is the number of groups we
+		 * will use for the part of the data that doesn't fall into the
+		 * first nbuckets hash buckets.
+		 */
+		nbatch = (int) ceil((inner_rel_bytes - hash_table_bytes) /
+							hash_table_bytes);
+		if (nbatch <= 0)
+			nbatch = 1;
+	}
+
+	/*
+	 * Now, totalbuckets is the number of (virtual) hashbuckets for the
+	 * whole relation, and nbuckets is the number of physical hashbuckets
+	 * we will use in the first pass.  Data falling into the first
+	 * nbuckets virtual hashbuckets gets handled in the first pass;
+	 * everything else gets divided into nbatch batches to be processed in
+	 * additional passes.
+	 */
+	*virtualbuckets = totalbuckets;
+	*physicalbuckets = nbuckets;
+	*numbatches = nbatch;
+}
+
+
 /* ----------------------------------------------------------------
 *		ExecHashTableDestroy
 *

--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -42,7 +42,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.76 2001/06/10 02:59:35 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.77 2001/06/11 00:17:08 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -791,19 +791,19 @@ cost_hashjoin(Path *path, Query *root,
 * smart enough to figure out how the restrict clauses might change the
 * distribution, so this will have to do for now.
 *
- * The executor tries for average bucket loading of NTUP_PER_BUCKET by setting
- * number of buckets equal to ntuples / NTUP_PER_BUCKET, which would yield
- * a bucketsize fraction of NTUP_PER_BUCKET / ntuples.  But that goal will
- * be reached only if the data values are uniformly distributed among the
- * buckets, which requires (a) at least ntuples / NTUP_PER_BUCKET distinct
- * data values, and (b) a not-too-skewed data distribution.  Otherwise the
- * buckets will be nonuniformly occupied.  If the other relation in the join
- * has a similar distribution, the most-loaded buckets are exactly those
- * that will be probed most often.  Therefore, the "average" bucket size for
- * costing purposes should really be taken as something close to the "worst
- * case" bucket size.  We try to estimate this by first scaling up if there
- * are too few distinct data values, and then scaling up again by the
- * ratio of the most common value's frequency to the average frequency.
+ * We can get the number of buckets the executor will use for the given
+ * input relation.  If the data were perfectly distributed, with the same
+ * number of tuples going into each available bucket, then the bucketsize
+ * fraction would be 1/nbuckets.  But this happy state of affairs will occur
+ * only if (a) there are at least nbuckets distinct data values, and (b)
+ * we have a not-too-skewed data distribution.  Otherwise the buckets will
+ * be nonuniformly occupied.  If the other relation in the join has a key
+ * distribution similar to this one's, then the most-loaded buckets are
+ * exactly those that will be probed most often.  Therefore, the "average"
+ * bucket size for costing purposes should really be taken as something close
+ * to the "worst case" bucket size.  We try to estimate this by adjusting the
+ * fraction if there are too few distinct data values, and then scaling up
+ * by the ratio of the most common value's frequency to the average frequency.
 *
 * If no statistics are available, use a default estimate of 0.1.  This will
 * discourage use of a hash rather strongly if the inner relation is large,
@@ -815,11 +815,13 @@ estimate_hash_bucketsize(Query *root, Var *var)
 {
 	Oid			relid;
 	RelOptInfo *rel;
+	int			virtualbuckets;
+	int			physicalbuckets;
+	int			numbatches;
 	HeapTuple	tuple;
 	Form_pg_statistic stats;
 	double		estfract,
 				ndistinct,
-				needdistinct,
 				mcvfreq,
 				avgfreq;
 	float4	   *numbers;
@@ -841,6 +843,12 @@ estimate_hash_bucketsize(Query *root, Var *var)
 	if (rel->tuples <= 0.0 || rel->rows <= 0.0)
 		return 0.1;				/* ensure we can divide below */

+	/* Get hash table size that executor would use for this relation */
+	ExecChooseHashTableSize(rel->rows, rel->width,
+							&virtualbuckets,
+							&physicalbuckets,
+							&numbatches);
+
 	tuple = SearchSysCache(STATRELATT,
 						   ObjectIdGetDatum(relid),
 						   Int16GetDatum(var->varattno),
@@ -857,7 +865,7 @@ estimate_hash_bucketsize(Query *root, Var *var)
 			case ObjectIdAttributeNumber:
 			case SelfItemPointerAttributeNumber:
 				/* these are unique, so buckets should be well-distributed */
-				return (double) NTUP_PER_BUCKET / rel->rows;
+				return 1.0 / (double) virtualbuckets;
 			case TableOidAttributeNumber:
 				/* hashing this is a terrible idea... */
 				return 1.0;
@@ -873,6 +881,12 @@ estimate_hash_bucketsize(Query *root, Var *var)
 	if (ndistinct < 0.0)
 		ndistinct = -ndistinct * rel->tuples;

+	if (ndistinct <= 0.0)		/* ensure we can divide */
+	{
+		ReleaseSysCache(tuple);
+		return 0.1;
+	}
+
 	/* Also compute avg freq of all distinct data values in raw relation */
 	avgfreq = (1.0 - stats->stanullfrac) / ndistinct;

@@ -887,20 +901,14 @@ estimate_hash_bucketsize(Query *root, Var *var)
 	ndistinct *= rel->rows / rel->tuples;

 	/*
-	 * Form initial estimate of bucketsize fraction.  Here we use rel->rows,
-	 * ie the number of rows after applying restriction clauses, because
-	 * that's what the fraction will eventually be multiplied by in
-	 * cost_heapjoin.
+	 * Initial estimate of bucketsize fraction is 1/nbuckets as long as
+	 * the number of buckets is less than the expected number of distinct
+	 * values; otherwise it is 1/ndistinct.
 	 */
-	estfract = (double) NTUP_PER_BUCKET / rel->rows;
-
-	/*
-	 * Adjust estimated bucketsize if too few distinct values (after
-	 * restriction clauses) to fill all the buckets.
-	 */
-	needdistinct = rel->rows / (double) NTUP_PER_BUCKET;
-	if (ndistinct < needdistinct)
-		estfract *= needdistinct / ndistinct;
+	if (ndistinct > (double) virtualbuckets)
+		estfract = 1.0 / (double) virtualbuckets;
+	else
+		estfract = 1.0 / ndistinct;

 	/*
 	 * Look up the frequency of the most common value, if available.

--- a/src/include/executor/nodeHash.h
+++ b/src/include/executor/nodeHash.h
@@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $Id: nodeHash.h,v 1.19 2001/03/22 04:00:44 momjian Exp $
+ * $Id: nodeHash.h,v 1.20 2001/06/11 00:17:07 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -16,9 +16,6 @@

 #include "nodes/plannodes.h"

-/* NTUP_PER_BUCKET is exported because planner wants to see it */
-#define NTUP_PER_BUCKET			10
-
 extern TupleTableSlot *ExecHash(Hash *node);
 extern bool ExecInitHash(Hash *node, EState *estate, Plan *parent);
 extern int	ExecCountSlotsHash(Hash *node);
@@ -35,5 +32,9 @@ extern HeapTuple ExecScanHashBucket(HashJoinState *hjstate, List *hjclauses,
 				   ExprContext *econtext);
 extern void ExecHashTableReset(HashJoinTable hashtable, long ntuples);
 extern void ExecReScanHash(Hash *node, ExprContext *exprCtxt, Plan *parent);
+extern void ExecChooseHashTableSize(double ntuples, int tupwidth,
+									int *virtualbuckets,
+									int *physicalbuckets,
+									int *numbatches);

 #endif	 /* NODEHASH_H */