Change hash index creation so that rather than always establishing exactly

two buckets at the start, we create a number of buckets appropriate for the estimated size of the table. This avoids a lot of expensive bucket-split actions during initial index build on an already-populated table. This is one of the two core ideas of Tom Raney and Shreya Bhargava's patch to reduce hash index build time. I'm committing it separately to make it easier for people to test the effects of this separately from the effects of their other core idea (pre-sorting the index entries by bucket number).

Change hash index creation so that rather than always establishing exactly
two buckets at the start, we create a number of buckets appropriate for the estimated size of the table. This avoids a lot of expensive bucket-split actions during initial index build on an already-populated table. This is one of the two core ideas of Tom Raney and Shreya Bhargava's patch to reduce hash index build time. I'm committing it separately to make it easier for people to test the effects of this separately from the effects of their other core idea (pre-sorting the index entries by bucket number).
c9a1cc69 · Tom Lane · 4873c96f · c9a1cc69 · c9a1cc69 · c9a1cc69
6 changed file
--- a/src/backend/access/hash/README
+++ b/src/backend/access/hash/README
-$PostgreSQL: pgsql/src/backend/access/hash/README,v 1.6 2007/04/19 20:24:04 tgl Exp $
+$PostgreSQL: pgsql/src/backend/access/hash/README,v 1.7 2008/03/15 20:46:31 tgl Exp $

 This directory contains an implementation of hash indexing for Postgres.  Most
 of the core ideas are taken from Margo Seltzer and Ozan Yigit, A New Hashing
@@ -65,6 +65,11 @@ hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
 former.  The difference between the two represents the number of overflow
 pages appearing between the bucket page groups of splitpoints N and N+1.

+(Note: the above describes what happens when filling an initially minimally
+sized hash index.  In practice, we try to estimate the required index size
+and allocate a suitable number of splitpoints immediately, to avoid
+expensive re-splitting during initial index build.)
+
 When S splitpoints exist altogether, the array entries hashm_spares[0]
 through hashm_spares[S] are valid; hashm_spares[S] records the current
 total number of overflow pages.  New overflow pages are created as needed
@@ -101,9 +106,9 @@ includes the bitmap pages, which is the reason for saying that bitmap
 pages are a subset of the overflow pages.  It turns out in fact that each
 bitmap page's first bit represents itself --- this is not an essential
 property, but falls out of the fact that we only allocate another bitmap
-page when we really need one.  Bit number zero always corresponds to block
-number 3, which is the first bitmap page and is allocated during index
-creation.
+page when we really need one.  Bit number zero always corresponds to the
+first bitmap page, which is allocated during index creation just after all
+the initially created buckets.


 Lock definitions

--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.98 2008/01/01 19:45:46 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.99 2008/03/15 20:46:31 tgl Exp $
 *
 * NOTES
 *	  This file contains only the public interface routines.
@@ -22,6 +22,7 @@
 #include "access/hash.h"
 #include "catalog/index.h"
 #include "commands/vacuum.h"
+#include "optimizer/plancat.h"


 /* Working state for hashbuild and its callback */
@@ -48,6 +49,7 @@ hashbuild(PG_FUNCTION_ARGS)
 	Relation	index = (Relation) PG_GETARG_POINTER(1);
 	IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
 	IndexBuildResult *result;
+	BlockNumber	relpages;
 	double		reltuples;
 	HashBuildState buildstate;

@@ -59,8 +61,11 @@ hashbuild(PG_FUNCTION_ARGS)
 		elog(ERROR, "index \"%s\" already contains data",
 			 RelationGetRelationName(index));

-	/* initialize the hash index metadata page */
-	_hash_metapinit(index);
+	/* estimate the number of rows currently present in the table */
+	estimate_rel_size(heap, NULL, &relpages, &reltuples);
+
+	/* initialize the hash index metadata page and initial buckets */
+	_hash_metapinit(index, reltuples);

 	/* build the index */
 	buildstate.indtuples = 0;

--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.72 2008/01/01 19:45:46 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.73 2008/03/15 20:46:31 tgl Exp $
 *
 * NOTES
 *	  Postgres hash pages look like ordinary relation pages.  The opaque
@@ -312,15 +312,17 @@ _hash_chgbufaccess(Relation rel,

 /*
 *	_hash_metapinit() -- Initialize the metadata page of a hash index,
- *				the two buckets that we begin with and the initial
- *				bitmap page.
+ *				the initial buckets, and the initial bitmap page.
+ *
+ * The initial number of buckets is dependent on num_tuples, an estimate
+ * of the number of tuples to be loaded into the index initially.
 *
 * We are fairly cavalier about locking here, since we know that no one else
 * could be accessing this index.  In particular the rule about not holding
 * multiple buffer locks is ignored.
 */
 void
-_hash_metapinit(Relation rel)
+_hash_metapinit(Relation rel, double num_tuples)
 {
 	HashMetaPage metap;
 	HashPageOpaque pageopaque;
@@ -330,7 +332,10 @@ _hash_metapinit(Relation rel)
 	int32		data_width;
 	int32		item_width;
 	int32		ffactor;
-	uint16		i;
+	double		dnumbuckets;
+	uint32		num_buckets;
+	uint32		log2_num_buckets;
+	uint32		i;

 	/* safety check */
 	if (RelationGetNumberOfBlocks(rel) != 0)
@@ -354,7 +359,26 @@ _hash_metapinit(Relation rel)
 		ffactor = 10;

 	/*
-	 * We initialize the metapage, the first two bucket pages, and the first
+	 * Choose the number of initial bucket pages to match the fill factor
+	 * given the estimated number of tuples.  We round up the result to the
+	 * next power of 2, however, and always force at least 2 bucket pages.
+	 * The upper limit is determined by considerations explained in
+	 * _hash_expandtable().
+	 */
+	dnumbuckets = num_tuples / ffactor;
+	if (dnumbuckets <= 2.0)
+		num_buckets = 2;
+	else if (dnumbuckets >= (double) 0x40000000)
+		num_buckets = 0x40000000;
+	else
+		num_buckets = ((uint32) 1) << _hash_log2((uint32) dnumbuckets);
+
+	log2_num_buckets = _hash_log2(num_buckets);
+	Assert(num_buckets == (((uint32) 1) << log2_num_buckets));
+	Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS);
+
+	/*
+	 * We initialize the metapage, the first N bucket pages, and the first
 	 * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend()
 	 * calls to occur.	This ensures that the smgr level has the right idea of
 	 * the physical index length.
@@ -398,23 +422,25 @@ _hash_metapinit(Relation rel)
 	metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);

 	/*
-	 * We initialize the index with two buckets, 0 and 1, occupying physical
-	 * blocks 1 and 2.	The first freespace bitmap page is in block 3.
+	 * We initialize the index with N buckets, 0 .. N-1, occupying physical
+	 * blocks 1 to N.  The first freespace bitmap page is in block N+1.
+	 * Since N is a power of 2, we can set the masks this way:
 	 */
-	metap->hashm_maxbucket = metap->hashm_lowmask = 1;	/* nbuckets - 1 */
-	metap->hashm_highmask = 3;	/* (nbuckets << 1) - 1 */
+	metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1;
+	metap->hashm_highmask = (num_buckets << 1) - 1;

 	MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
 	MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));

-	metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */
-	metap->hashm_ovflpoint = 1;
+	/* Set up mapping for one spare page after the initial splitpoints */
+	metap->hashm_spares[log2_num_buckets] = 1;
+	metap->hashm_ovflpoint = log2_num_buckets;
 	metap->hashm_firstfree = 0;

 	/*
-	 * Initialize the first two buckets
+	 * Initialize the first N buckets
 	 */
-	for (i = 0; i <= 1; i++)
+	for (i = 0; i < num_buckets; i++)
 	{
 		buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i));
 		pg = BufferGetPage(buf);
@@ -430,7 +456,7 @@ _hash_metapinit(Relation rel)
 	/*
 	 * Initialize first bitmap page
 	 */
-	_hash_initbitmap(rel, metap, 3);
+	_hash_initbitmap(rel, metap, num_buckets + 1);

 	/* all done */
 	_hash_wrtbuf(rel, metabuf);
@@ -511,6 +537,9 @@ _hash_expandtable(Relation rel, Buffer metabuf)
 	 * index with 2^32 buckets would certainly overflow BlockNumber and hence
 	 * _hash_alloc_buckets() would fail, but if we supported buckets smaller
 	 * than a disk block then this would be an independent constraint.
+	 *
+	 * If you change this, see also the maximum initial number of buckets
+	 * in _hash_metapinit().
 	 */
 	if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
 		goto fail;

--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -9,7 +9,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/util/plancat.c,v 1.140 2008/01/12 00:11:39 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/util/plancat.c,v 1.141 2008/03/15 20:46:31 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -45,8 +45,6 @@ bool		constraint_exclusion = false;
 get_relation_info_hook_type get_relation_info_hook = NULL;


-static void estimate_rel_size(Relation rel, int32 *attr_widths,
-				  BlockNumber *pages, double *tuples);
 static List *get_relation_constraints(Oid relationObjectId, RelOptInfo *rel,
 						 bool include_notnull);

@@ -319,7 +317,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 * relation's attr_width[] cache; we fill this in if we have need to compute
 * the attribute widths for estimation purposes.
 */
-static void
+void
 estimate_rel_size(Relation rel, int32 *attr_widths,
 				  BlockNumber *pages, double *tuples)
 {

--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.84 2008/01/01 19:45:56 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.85 2008/03/15 20:46:31 tgl Exp $
 *
 * NOTES
 *		modeled after Margo Seltzer's hash implementation for unix.
@@ -298,7 +298,7 @@ extern void _hash_dropbuf(Relation rel, Buffer buf);
 extern void _hash_wrtbuf(Relation rel, Buffer buf);
 extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access,
 				   int to_access);
-extern void _hash_metapinit(Relation rel);
+extern void _hash_metapinit(Relation rel, double num_tuples);
 extern void _hash_pageinit(Page page, Size size);
 extern void _hash_expandtable(Relation rel, Buffer metabuf);


--- a/src/include/optimizer/plancat.h
+++ b/src/include/optimizer/plancat.h
@@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/optimizer/plancat.h,v 1.47 2008/01/01 19:45:58 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/optimizer/plancat.h,v 1.48 2008/03/15 20:46:31 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -15,6 +15,7 @@
 #define PLANCAT_H

 #include "nodes/relation.h"
+#include "utils/rel.h"

 /* Hook for plugins to get control in get_relation_info() */
 typedef void (*get_relation_info_hook_type) (PlannerInfo *root,
@@ -27,6 +28,9 @@ extern PGDLLIMPORT get_relation_info_hook_type get_relation_info_hook;
 extern void get_relation_info(PlannerInfo *root, Oid relationObjectId,
 				  bool inhparent, RelOptInfo *rel);

+extern void estimate_rel_size(Relation rel, int32 *attr_widths,
+							  BlockNumber *pages, double *tuples);
+
 extern bool relation_excluded_by_constraints(RelOptInfo *rel,
 								 RangeTblEntry *rte);