Commit 26069a58 authored by Tom Lane

Rewrite hash join to use simple linked lists instead of a fixed-size
hashtable.  This should prevent 'hashtable out of memory' errors, unless
you really do run out of memory.  Note: target size for hashtable is now
taken from -S postmaster switch, not -B, since it is local memory in the
backend rather than shared memory.
Parent d261a5ec
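The heart of the change shows up already in the data structures below: a hash bucket is no longer a fixed-size region addressed by offsets, but simply the head of a linked list of palloc'd tuples. Below is a minimal, self-contained sketch of that technique, using malloc and a caller-supplied hash value in place of the backend's palloc and hash function; SketchTuple and SketchTable are illustrative names, not the committed code.

#include <stdlib.h>
#include <string.h>

/* each bucket is the head of a singly linked list of tuples */
typedef struct SketchTuple
{
	struct SketchTuple *next;	/* next tuple in the same bucket */
	size_t		len;			/* length of the tuple data below */
	/* tuple data is stored immediately after this header */
} SketchTuple;

typedef struct SketchTable
{
	int			nbuckets;		/* number of in-memory buckets */
	SketchTuple **buckets;		/* buckets[i] is head of list i */
} SketchTable;

/* Insert never fails for lack of hashtable slots: it just links the
 * tuple onto the front of its bucket's list.  Only genuine allocator
 * exhaustion (malloc here, palloc in the backend) can stop it. */
static int
sketch_insert(SketchTable *tab, unsigned hashval,
			  const void *data, size_t len)
{
	int			bucketno = (int) (hashval % (unsigned) tab->nbuckets);
	SketchTuple *tup = malloc(sizeof(SketchTuple) + len);

	if (tup == NULL)
		return -1;				/* really out of memory */
	memcpy(tup + 1, data, len);
	tup->len = len;
	tup->next = tab->buckets[bucketno];
	tab->buckets[bucketno] = tup;
	return 0;
}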
This diff is collapsed.
This diff is collapsed.
 /*-------------------------------------------------------------------------
  *
  * hashjoin.h
- *	  internal structures for hash table and buckets
+ *	  internal structures for hash joins
  *
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: hashjoin.h,v 1.10 1999/05/09 00:53:18 tgl Exp $
+ * $Id: hashjoin.h,v 1.11 1999/05/18 21:33:04 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #ifndef HASHJOIN_H
 #define HASHJOIN_H
 
-#include <storage/ipc.h>
+#include "access/htup.h"
+#include "storage/fd.h"
+#include "utils/mcxt.h"
 
-/* -----------------
- *	have to use relative address as pointers in the hashtable
- *	because the hashtable may reallocate in different processes
+/* ----------------------------------------------------------------
+ *				hash-join hash table structures
+ *
+ * Each active hashjoin has a HashJoinTable control block which is
+ * palloc'd in the executor's context.  All other storage needed for
+ * the hashjoin is kept in private "named portals", one for each hashjoin.
+ * This makes it easy and fast to release the storage when we don't need it
+ * anymore.
+ *
+ * The portal manager guarantees that portals will be discarded at end of
+ * transaction, so we have no problem with a memory leak if the join is
+ * aborted early by an error.  (Likewise, any temporary files we make will
+ * be cleaned up by the virtual file manager in event of an error.)
  *
- * XXX: this relative-address stuff is useless on all supported platforms
- * and is an ever-dangerous source of bugs.  Really ought to rip it out.
- * -----------------
+ * Storage that should live through the entire join is allocated from the
+ * portal's "variable context", while storage that is only wanted for the
+ * current batch is allocated in the portal's "heap context".  By popping
+ * the portal's heap at the end of a batch, we free all the per-batch storage
+ * reliably and without tedium.
+ * ----------------------------------------------------------------
  */
-typedef int RelativeAddr;
-
-/* ------------------
- *	The relative addresses are always relative to the head of the
- *	hashtable, the following macros convert them to/from absolute address.
- *	NULL is represented as -1 (CAUTION: RELADDR() doesn't handle that!).
- *	CAUTION: ABSADDR evaluates its arg twice!!
- * ------------------
- */
-#define ABSADDR(X)	((X) < 0 ? (char*) NULL : (char*)hashtable + (X))
-#define RELADDR(X)	((RelativeAddr)((char*)(X) - (char*)hashtable))
 
-typedef char **charPP;
-typedef int *intP;
+typedef struct HashJoinTupleData
+{
+	struct HashJoinTupleData *next;	/* link to next tuple in same bucket */
+	HeapTupleData htup;				/* tuple header */
+} HashJoinTupleData;
 
-/* ----------------------------------------------------------------
- *				hash-join hash table structures
- * ----------------------------------------------------------------
- */
+typedef HashJoinTupleData *HashJoinTuple;
+
 typedef struct HashTableData
 {
-	int			nbuckets;
-	int			totalbuckets;
-	int			bucketsize;
-	IpcMemoryId shmid;
-	RelativeAddr top;			/* char* */
-	RelativeAddr bottom;		/* char* */
-	RelativeAddr overflownext;	/* char* */
-	RelativeAddr batch;			/* char* */
-	RelativeAddr readbuf;		/* char* */
-	int			nbatch;
-	RelativeAddr outerbatchPos; /* RelativeAddr* */
-	RelativeAddr innerbatchPos; /* RelativeAddr* */
-	RelativeAddr innerbatchSizes;	/* int* */
-	int			curbatch;
-	int			nprocess;
-	int			pcount;
-} HashTableData;				/* real hash table follows here */
+	int			nbuckets;		/* buckets in use during this batch */
+	int			totalbuckets;	/* total number of (virtual) buckets */
+	HashJoinTuple *buckets;		/* buckets[i] is head of list of tuples */
+	/* buckets array is per-batch storage, as are all the tuples */
 
-typedef HashTableData *HashJoinTable;
+	int			nbatch;			/* number of batches; 0 means 1-pass join */
+	int			curbatch;		/* current batch #, or 0 during 1st pass */
 
-typedef struct OverflowTupleData
-{
-	RelativeAddr tuple;			/* HeapTuple */
-	RelativeAddr next;			/* struct OverflowTupleData * */
-} OverflowTupleData;			/* real tuple follows here */
+	/* all these arrays are allocated for the life of the hash join,
+	 * but only if nbatch > 0:
+	 */
+	BufFile   **innerBatchFile;	/* buffered virtual temp file per batch */
+	BufFile   **outerBatchFile;	/* buffered virtual temp file per batch */
+	long	   *outerBatchSize;	/* count of tuples in each outer batch file */
+	long	   *innerBatchSize;	/* count of tuples in each inner batch file */
 
-typedef OverflowTupleData *OverflowTuple;
+	/* During 1st scan of inner relation, we get tuples from executor.
+	 * If nbatch > 0 then tuples that don't belong in first nbuckets logical
+	 * buckets get dumped into inner-batch temp files.
+	 * The same statements apply for the 1st scan of the outer relation,
+	 * except we write tuples to outer-batch temp files.
+	 * If nbatch > 0 then we do the following for each batch:
+	 *	1. Read tuples from inner batch file, load into hash buckets.
+	 *	2. Read tuples from outer batch file, match to hash buckets and output.
+	 */
 
-typedef struct HashBucketData
-{
-	RelativeAddr top;			/* HeapTuple */
-	RelativeAddr bottom;		/* HeapTuple */
-	RelativeAddr firstotuple;	/* OverflowTuple */
-	RelativeAddr lastotuple;	/* OverflowTuple */
-} HashBucketData;				/* real bucket follows here */
+	/* Ugly kluge: myPortal ought to be declared as type Portal (ie, PortalD*)
+	 * but if we try to include utils/portal.h here, we end up with a
+	 * circular dependency of include files!  Until the various node.h files
+	 * are restructured in a cleaner way, we have to fake it.  The most
+	 * reliable fake seems to be to declare myPortal as void * and then
+	 * cast it to the right things in nodeHash.c.
+	 */
+	void	   *myPortal;		/* where to keep working storage */
+	MemoryContext hashCxt;		/* context for whole-hash-join storage */
+	MemoryContext batchCxt;		/* context for this-batch-only storage */
+} HashTableData;
 
-typedef HashBucketData *HashBucket;
+typedef HashTableData *HashJoinTable;
 
 #endif	 /* HASHJOIN_H */
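The nbuckets/totalbuckets/nbatch fields above imply a routing rule: buckets 0..nbuckets-1 are processed in memory during the current pass, while the remaining "virtual" buckets are spread evenly across the batch temp files. A hypothetical rendering of that rule follows; the formula is an assumption for illustration, and the authoritative logic lives in the executor's nodeHashjoin.c.

/* Which batch does a (virtual) bucket belong to?  0 means "handled in
 * memory during the first pass"; 1..nbatch select a temp file.  A sketch
 * under the assumptions stated above, not the committed code. */
static int
sketch_get_batch(int bucketno, int nbuckets, int totalbuckets, int nbatch)
{
	if (nbatch == 0 || bucketno < nbuckets)
		return 0;
	return 1 + (nbatch * (bucketno - nbuckets)) / (totalbuckets - nbuckets);
}

For instance, with nbuckets = 100, totalbuckets = 400 and nbatch = 3, virtual bucket 250 lands in batch 1 + (3 * 150) / 300 = 2.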
@@ -6,7 +6,7 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: nodeHash.h,v 1.11 1999/02/13 23:21:25 momjian Exp $
+ * $Id: nodeHash.h,v 1.12 1999/05/18 21:33:05 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -18,7 +18,6 @@
 #include "nodes/execnodes.h"
 #include "nodes/pg_list.h"
 #include "nodes/plannodes.h"
-#include "storage/fd.h"
 #include "utils/syscache.h"
 
 extern TupleTableSlot *ExecHash(Hash *node);
@@ -26,15 +25,14 @@ extern bool ExecInitHash(Hash *node, EState *estate, Plan *parent);
 extern int	ExecCountSlotsHash(Hash *node);
 extern void ExecEndHash(Hash *node);
 extern HashJoinTable ExecHashTableCreate(Hash *node);
-extern void ExecHashTableInsert(HashJoinTable hashtable, ExprContext *econtext,
-					Var *hashkey, File *batches);
 extern void ExecHashTableDestroy(HashJoinTable hashtable);
+extern void ExecHashTableInsert(HashJoinTable hashtable, ExprContext *econtext,
+					Var *hashkey);
 extern int	ExecHashGetBucket(HashJoinTable hashtable, ExprContext *econtext,
-				  Var *hashkey);
-extern HeapTuple ExecScanHashBucket(HashJoinState *hjstate, HashBucket bucket,
-				  HeapTuple curtuple, List *hjclauses,
-				  ExprContext *econtext);
-extern void ExecHashTableReset(HashJoinTable hashtable, int ntuples);
+				  Var *hashkey);
+extern HeapTuple ExecScanHashBucket(HashJoinState *hjstate, List *hjclauses,
+				  ExprContext *econtext);
+extern void ExecHashTableReset(HashJoinTable hashtable, long ntuples);
 extern void ExecReScanHash(Hash *node, ExprContext *exprCtxt, Plan *parent);
 
 #endif	 /* NODEHASH_H */
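Taken together, the new declarations shrink the caller's responsibilities: ExecHashTableInsert no longer takes a batch-file array, and ExecScanHashBucket no longer needs the caller to pass the bucket and current tuple, since that scan cursor now lives in HashJoinState. The sketch below drives the revised API using only the signatures above; fetch_inner() and emit() are hypothetical helpers standing in for the executor's tuple plumbing, per-batch processing via ExecHashTableReset is omitted, and this is not the real ExecHashJoin.

#include "executor/nodeHash.h"		/* the declarations shown above */

static void
sketch_hashjoin(Hash *hashNode, HashJoinState *hjstate,
				ExprContext *econtext, Var *innerKey, Var *outerKey,
				List *hjclauses,
				bool (*fetch_inner)(ExprContext *),
				void (*emit)(HeapTuple))
{
	HashJoinTable hashtable = ExecHashTableCreate(hashNode);
	HeapTuple	match;

	/* build: Insert hides memory-vs-batch-file routing from the caller */
	while (fetch_inner(econtext))
		ExecHashTableInsert(hashtable, econtext, innerKey);

	/* probe one outer tuple: pick a bucket, then walk its tuple list;
	 * the scan cursor lives in hjstate, not in caller variables */
	(void) ExecHashGetBucket(hashtable, econtext, outerKey);
	while ((match = ExecScanHashBucket(hjstate, hjclauses, econtext)) != NULL)
		emit(match);

	ExecHashTableDestroy(hashtable);
}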
@@ -6,7 +6,7 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: nodeHashjoin.h,v 1.11 1999/02/13 23:21:26 momjian Exp $
+ * $Id: nodeHashjoin.h,v 1.12 1999/05/18 21:33:05 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -21,9 +21,7 @@ extern TupleTableSlot *ExecHashJoin(HashJoin *node);
 extern bool ExecInitHashJoin(HashJoin *node, EState *estate, Plan *parent);
 extern int	ExecCountSlotsHashJoin(HashJoin *node);
 extern void ExecEndHashJoin(HashJoin *node);
-extern char *ExecHashJoinSaveTuple(HeapTuple heapTuple, char *buffer,
-				  File file, char *position);
+extern void ExecHashJoinSaveTuple(HeapTuple heapTuple, BufFile *file);
 extern void ExecReScanHashJoin(HashJoin *node, ExprContext *exprCtxt, Plan *parent);
 
 #endif	 /* NODEHASHJOIN_H */
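The old ExecHashJoinSaveTuple made the caller juggle a scratch buffer, a raw File, and a write position; the new signature suggests all of that state now lives inside the buffered file. A plausible shape for the helper, assuming the BufFile layer in storage/fd.h offers a BufFileWrite(file, ptr, size) primitive and that a tuple is saved as its HeapTupleData header followed by t_len bytes of body; the committed nodeHashjoin.c may differ in detail.

/* Sketch: append one tuple to a batch temp file.  Assumes BufFileWrite()
 * from storage/fd.h and elog() for error reporting; the on-file layout
 * (header struct, then tuple body) is an assumption, not confirmed by
 * this diff. */
static void
sketch_save_tuple(HeapTuple heapTuple, BufFile *file)
{
	if (BufFileWrite(file, (void *) heapTuple, sizeof(HeapTupleData)) !=
		sizeof(HeapTupleData))
		elog(ERROR, "sketch_save_tuple: write of tuple header failed");
	if (BufFileWrite(file, (void *) heapTuple->t_data, heapTuple->t_len) !=
		(size_t) heapTuple->t_len)
		elog(ERROR, "sketch_save_tuple: write of tuple body failed");
}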