nodeHashjoin.c 24.8 KB
Newer Older
1 2
/*-------------------------------------------------------------------------
 *
3
 * nodeHashjoin.c
4
 *	  Routines to handle hash join nodes
5
 *
6
 * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
B
Add:  
Bruce Momjian 已提交
7
 * Portions Copyright (c) 1994, Regents of the University of California
8 9 10
 *
 *
 * IDENTIFICATION
11
 *	  $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.81 2006/03/05 15:58:26 momjian Exp $
12 13 14
 *
 *-------------------------------------------------------------------------
 */
15

B
Bruce Momjian 已提交
16
#include "postgres.h"
17 18

#include "executor/executor.h"
19
#include "executor/hashjoin.h"
20 21
#include "executor/nodeHash.h"
#include "executor/nodeHashjoin.h"
B
Bruce Momjian 已提交
22
#include "optimizer/clauses.h"
23 24
#include "utils/memutils.h"

25

26
static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
B
Bruce Momjian 已提交
27 28
						  HashJoinState *hjstate,
						  uint32 *hashvalue);
29
static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
30
						  BufFile *file,
31
						  uint32 *hashvalue,
B
Bruce Momjian 已提交
32
						  TupleTableSlot *tupleSlot);
33
static int	ExecHashJoinNewBatch(HashJoinState *hjstate);
34

35

36
/* ----------------------------------------------------------------
37
 *		ExecHashJoin
38
 *
39
 *		This function implements the Hybrid Hashjoin algorithm.
40 41 42
 *
 *		Note: the relation we build hash table on is the "inner"
 *			  the other one is "outer".
43 44
 * ----------------------------------------------------------------
 */
45
TupleTableSlot *				/* return: a tuple or NULL */
46
ExecHashJoin(HashJoinState *node)
47
{
48
	EState	   *estate;
49 50
	PlanState  *outerNode;
	HashState  *hashNode;
51 52
	List	   *joinqual;
	List	   *otherqual;
53
	TupleTableSlot *inntuple;
54
	ExprContext *econtext;
55
	ExprDoneCond isDone;
56 57
	HashJoinTable hashtable;
	HeapTuple	curtuple;
58
	TupleTableSlot *outerTupleSlot;
59 60
	uint32		hashvalue;
	int			batchno;
61

62 63
	/*
	 * get information from HashJoin node
64
	 */
65 66 67 68 69
	estate = node->js.ps.state;
	joinqual = node->js.joinqual;
	otherqual = node->js.ps.qual;
	hashNode = (HashState *) innerPlanState(node);
	outerNode = outerPlanState(node);
70

71
	/*
72
	 * get information from HashJoin state
73
	 */
74 75
	hashtable = node->hj_HashTable;
	econtext = node->js.ps.ps_ExprContext;
76

77
	/*
B
Bruce Momjian 已提交
78 79 80
	 * Check to see if we're still projecting out tuples from a previous join
	 * tuple (because there is a function-returning-set in the projection
	 * expressions).  If so, try to project another one.
81
	 */
82
	if (node->js.ps.ps_TupFromTlist)
83 84 85
	{
		TupleTableSlot *result;

86
		result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
87
		if (isDone == ExprMultipleResult)
88
			return result;
89
		/* Done with that source tuple... */
90
		node->js.ps.ps_TupFromTlist = false;
91
	}
92

93
	/*
B
Bruce Momjian 已提交
94 95 96
	 * If we're doing an IN join, we want to return at most one row per outer
	 * tuple; so we can stop scanning the inner scan if we matched on the
	 * previous try.
97
	 */
98
	if (node->js.jointype == JOIN_IN && node->hj_MatchedOuter)
99 100
		node->hj_NeedNewOuter = true;

101 102
	/*
	 * Reset per-tuple memory context to free any expression evaluation
B
Bruce Momjian 已提交
103 104
	 * storage allocated in the previous tuple cycle.  Note this can't happen
	 * until we're done projecting out tuples from a join tuple.
105 106 107
	 */
	ResetExprContext(econtext);

108 109 110
	/*
	 * if this is the first call, build the hash table for inner relation
	 */
111
	if (hashtable == NULL)
112 113
	{
		/*
114
		 * If the outer relation is completely empty, we can quit without
B
Bruce Momjian 已提交
115 116 117 118 119 120 121
		 * building the hash table.  However, for an inner join it is only a
		 * win to check this when the outer relation's startup cost is less
		 * than the projected cost of building the hash table.	Otherwise it's
		 * best to build the hash table first and see if the inner relation is
		 * empty.  (When it's an outer join, we should always make this check,
		 * since we aren't going to be able to skip the join on the strength
		 * of an empty inner relation anyway.)
122
		 *
123 124 125 126 127 128
		 * If we are rescanning the join, we make use of information gained
		 * on the previous scan: don't bother to try the prefetch if the
		 * previous scan found the outer relation nonempty.  This is not
		 * 100% reliable since with new parameters the outer relation might
		 * yield different results, but it's a good heuristic.
		 *
B
Bruce Momjian 已提交
129 130 131
		 * The only way to make the check is to try to fetch a tuple from the
		 * outer plan node.  If we succeed, we have to stash it away for later
		 * consumption by ExecHashJoinOuterGetTuple.
132
		 */
133 134 135
		if (node->js.jointype == JOIN_LEFT ||
			(outerNode->plan->startup_cost < hashNode->ps.plan->total_cost &&
			 !node->hj_OuterNotEmpty))
136
		{
137 138
			node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
			if (TupIsNull(node->hj_FirstOuterTupleSlot))
139 140
			{
				node->hj_OuterNotEmpty = false;
141
				return NULL;
142 143 144
			}
			else
				node->hj_OuterNotEmpty = true;
145
		}
146 147
		else
			node->hj_FirstOuterTupleSlot = NULL;
148 149

		/*
150
		 * create the hash table
151
		 */
152 153
		hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
										node->hj_HashOperators);
154
		node->hj_HashTable = hashtable;
155

156
		/*
157
		 * execute the Hash node, to build the hash table
158
		 */
159
		hashNode->hashtable = hashtable;
160
		(void) MultiExecProcNode((PlanState *) hashNode);
161

162
		/*
B
Bruce Momjian 已提交
163 164
		 * If the inner relation is completely empty, and we're not doing an
		 * outer join, we can quit without scanning the outer relation.
165
		 */
166
		if (hashtable->totalTuples == 0 && node->js.jointype != JOIN_LEFT)
167 168
			return NULL;

169
		/*
170 171
		 * need to remember whether nbatch has increased since we began
		 * scanning the outer relation
172
		 */
173
		hashtable->nbatch_outstart = hashtable->nbatch;
174 175 176 177 178 179 180

		/*
		 * Reset OuterNotEmpty for scan.  (It's OK if we fetched a tuple
		 * above, because ExecHashJoinOuterGetTuple will immediately
		 * set it again.)
		 */
		node->hj_OuterNotEmpty = false;
181
	}
182

183
	/*
184
	 * run the hash join process
185
	 */
186
	for (;;)
187
	{
188
		/*
189
		 * If we don't have an outer tuple, get the next one
190
		 */
191
		if (node->hj_NeedNewOuter)
192
		{
193 194 195
			outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
													   node,
													   &hashvalue);
196
			if (TupIsNull(outerTupleSlot))
197 198 199 200 201 202 203 204 205 206 207
			{
				/* end of join */
				return NULL;
			}

			node->js.ps.ps_OuterTupleSlot = outerTupleSlot;
			econtext->ecxt_outertuple = outerTupleSlot;
			node->hj_NeedNewOuter = false;
			node->hj_MatchedOuter = false;

			/*
B
Bruce Momjian 已提交
208 209
			 * now we have an outer tuple, find the corresponding bucket for
			 * this tuple from the hash table
210 211 212 213 214 215 216
			 */
			node->hj_CurHashValue = hashvalue;
			ExecHashGetBucketAndBatch(hashtable, hashvalue,
									  &node->hj_CurBucketNo, &batchno);
			node->hj_CurTuple = NULL;

			/*
B
Bruce Momjian 已提交
217 218
			 * Now we've got an outer tuple and the corresponding hash bucket,
			 * but this tuple may not belong to the current batch.
219 220 221 222
			 */
			if (batchno != hashtable->curbatch)
			{
				/*
B
Bruce Momjian 已提交
223 224
				 * Need to postpone this outer tuple to a later batch. Save it
				 * in the corresponding outer-batch file.
225 226 227 228 229 230
				 */
				Assert(batchno > hashtable->curbatch);
				ExecHashJoinSaveTuple(ExecFetchSlotTuple(outerTupleSlot),
									  hashvalue,
									  &hashtable->outerBatchFile[batchno]);
				node->hj_NeedNewOuter = true;
B
Bruce Momjian 已提交
231
				continue;		/* loop around for a new outer tuple */
232
			}
233 234
		}

235 236
		/*
		 * OK, scan the selected hash bucket for matches
237
		 */
238
		for (;;)
239
		{
240
			curtuple = ExecScanHashBucket(node, econtext);
241 242
			if (curtuple == NULL)
				break;			/* out of matches */
B
Bruce Momjian 已提交
243

244
			/*
245
			 * we've got a match, but still need to test non-hashed quals
246
			 */
247
			inntuple = ExecStoreTuple(curtuple,
248
									  node->hj_HashTupleSlot,
249 250 251
									  InvalidBuffer,
									  false);	/* don't pfree this tuple */
			econtext->ecxt_innertuple = inntuple;
252

253
			/* reset temp memory each time to avoid leaks from qual expr */
254 255
			ResetExprContext(econtext);

256 257
			/*
			 * if we pass the qual, then save state for next call and have
B
Bruce Momjian 已提交
258 259
			 * ExecProject form the projection, store it in the tuple table,
			 * and return the slot.
260
			 *
B
Bruce Momjian 已提交
261 262
			 * Only the joinquals determine MatchedOuter status, but all quals
			 * must pass to actually return the tuple.
263
			 */
264
			if (joinqual == NIL || ExecQual(joinqual, econtext, false))
265
			{
266
				node->hj_MatchedOuter = true;
267

268
				if (otherqual == NIL || ExecQual(otherqual, econtext, false))
269
				{
270 271
					TupleTableSlot *result;

272
					result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
273 274 275

					if (isDone != ExprEndResult)
					{
276
						node->js.ps.ps_TupFromTlist =
277 278 279
							(isDone == ExprMultipleResult);
						return result;
					}
280
				}
281

B
Bruce Momjian 已提交
282
				/*
B
Bruce Momjian 已提交
283
				 * If we didn't return a tuple, may need to set NeedNewOuter
B
Bruce Momjian 已提交
284
				 */
285 286 287 288 289
				if (node->js.jointype == JOIN_IN)
				{
					node->hj_NeedNewOuter = true;
					break;		/* out of loop over hash bucket */
				}
290 291 292
			}
		}

293 294
		/*
		 * Now the current outer tuple has run out of matches, so check
B
Bruce Momjian 已提交
295 296
		 * whether to emit a dummy outer-join tuple. If not, loop around to
		 * get a new outer tuple.
297
		 */
298
		node->hj_NeedNewOuter = true;
299

300 301
		if (!node->hj_MatchedOuter &&
			node->js.jointype == JOIN_LEFT)
302 303
		{
			/*
B
Bruce Momjian 已提交
304 305 306
			 * We are doing an outer join and there were no join matches for
			 * this outer tuple.  Generate a fake join tuple with nulls for
			 * the inner tuple, and return it if it passes the non-join quals.
307
			 */
308
			econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
309 310 311

			if (ExecQual(otherqual, econtext, false))
			{
312
				/*
B
Bruce Momjian 已提交
313 314
				 * qualification was satisfied so we project and return the
				 * slot containing the result tuple using ExecProject().
315 316 317
				 */
				TupleTableSlot *result;

318
				result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
319 320 321

				if (isDone != ExprEndResult)
				{
322
					node->js.ps.ps_TupFromTlist =
323 324 325 326 327
						(isDone == ExprMultipleResult);
					return result;
				}
			}
		}
328
	}
329 330 331
}

/* ----------------------------------------------------------------
332
 *		ExecInitHashJoin
333
 *
334
 *		Init routine for HashJoin node.
335 336
 * ----------------------------------------------------------------
 */
337
HashJoinState *
338
ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
339
{
340 341 342
	HashJoinState *hjstate;
	Plan	   *outerNode;
	Hash	   *hashNode;
343 344
	List	   *lclauses;
	List	   *rclauses;
345
	List	   *hoperators;
346
	ListCell   *l;
347

348 349 350
	/* check for unsupported flags */
	Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));

351
	/*
352 353 354
	 * create state structure
	 */
	hjstate = makeNode(HashJoinState);
355 356
	hjstate->js.ps.plan = (Plan *) node;
	hjstate->js.ps.state = estate;
357

358 359
	/*
	 * Miscellaneous initialization
360
	 *
361
	 * create expression context for node
362
	 */
363 364 365 366 367 368
	ExecAssignExprContext(estate, &hjstate->js.ps);

	/*
	 * initialize child expressions
	 */
	hjstate->js.ps.targetlist = (List *)
369
		ExecInitExpr((Expr *) node->join.plan.targetlist,
370 371
					 (PlanState *) hjstate);
	hjstate->js.ps.qual = (List *)
372
		ExecInitExpr((Expr *) node->join.plan.qual,
373 374 375
					 (PlanState *) hjstate);
	hjstate->js.jointype = node->join.jointype;
	hjstate->js.joinqual = (List *)
376
		ExecInitExpr((Expr *) node->join.joinqual,
377 378
					 (PlanState *) hjstate);
	hjstate->hashclauses = (List *)
379
		ExecInitExpr((Expr *) node->hashclauses,
380
					 (PlanState *) hjstate);
381

382
	/*
383
	 * initialize child nodes
384 385 386 387
	 *
	 * Note: we could suppress the REWIND flag for the inner input, which
	 * would amount to betting that the hash will be a single batch.  Not
	 * clear if this would be a win or not.
388
	 */
389 390
	outerNode = outerPlan(node);
	hashNode = (Hash *) innerPlan(node);
391

392 393
	outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags);
	innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate, eflags);
394

395
#define HASHJOIN_NSLOTS 3
396 397 398

	/*
	 * tuple table initialization
399
	 */
400
	ExecInitResultTupleSlot(estate, &hjstate->js.ps);
401 402 403 404 405
	hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate);

	switch (node->join.jointype)
	{
		case JOIN_INNER:
406
		case JOIN_IN:
407 408 409 410
			break;
		case JOIN_LEFT:
			hjstate->hj_NullInnerTupleSlot =
				ExecInitNullTupleSlot(estate,
B
Bruce Momjian 已提交
411
								 ExecGetResultType(innerPlanState(hjstate)));
412 413
			break;
		default:
414
			elog(ERROR, "unrecognized join type: %d",
415 416 417
				 (int) node->join.jointype);
	}

418
	/*
B
Bruce Momjian 已提交
419 420 421 422 423
	 * now for some voodoo.  our temporary tuple slot is actually the result
	 * tuple slot of the Hash node (which is our inner plan).  we do this
	 * because Hash nodes don't return tuples via ExecProcNode() -- instead
	 * the hash join node uses ExecScanHashBucket() to get at the contents of
	 * the hash table.	-cim 6/9/91
424 425
	 */
	{
426 427
		HashState  *hashstate = (HashState *) innerPlanState(hjstate);
		TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot;
428 429 430 431

		hjstate->hj_HashTupleSlot = slot;
	}

432 433
	/*
	 * initialize tuple type and projection info
434
	 */
435 436
	ExecAssignResultTypeFromTL(&hjstate->js.ps);
	ExecAssignProjectionInfo(&hjstate->js.ps);
437

438
	ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot,
439
						  ExecGetResultType(outerPlanState(hjstate)),
440
						  false);
441

442 443
	/*
	 * initialize hash-specific info
444
	 */
445
	hjstate->hj_HashTable = NULL;
446
	hjstate->hj_FirstOuterTupleSlot = NULL;
447

448
	hjstate->hj_CurHashValue = 0;
449
	hjstate->hj_CurBucketNo = 0;
450
	hjstate->hj_CurTuple = NULL;
451 452

	/*
B
Bruce Momjian 已提交
453 454 455 456
	 * Deconstruct the hash clauses into outer and inner argument values, so
	 * that we can evaluate those subexpressions separately.  Also make a list
	 * of the hash operator OIDs, in preparation for looking up the hash
	 * functions to use.
457
	 */
458 459
	lclauses = NIL;
	rclauses = NIL;
460
	hoperators = NIL;
461
	foreach(l, hjstate->hashclauses)
462
	{
463
		FuncExprState *fstate = (FuncExprState *) lfirst(l);
464
		OpExpr	   *hclause;
465

466 467
		Assert(IsA(fstate, FuncExprState));
		hclause = (OpExpr *) fstate->xprstate.expr;
468
		Assert(IsA(hclause, OpExpr));
469
		lclauses = lappend(lclauses, linitial(fstate->args));
470
		rclauses = lappend(rclauses, lsecond(fstate->args));
471
		hoperators = lappend_oid(hoperators, hclause->opno);
472
	}
473 474
	hjstate->hj_OuterHashKeys = lclauses;
	hjstate->hj_InnerHashKeys = rclauses;
475
	hjstate->hj_HashOperators = hoperators;
476 477
	/* child Hash node needs to evaluate inner hash keys, too */
	((HashState *) innerPlanState(hjstate))->hashkeys = rclauses;
478

479 480
	hjstate->js.ps.ps_OuterTupleSlot = NULL;
	hjstate->js.ps.ps_TupFromTlist = false;
481 482
	hjstate->hj_NeedNewOuter = true;
	hjstate->hj_MatchedOuter = false;
483
	hjstate->hj_OuterNotEmpty = false;
484

485
	return hjstate;
486 487 488
}

int
489
ExecCountSlotsHashJoin(HashJoin *node)
490
{
491
	return ExecCountSlotsNode(outerPlan(node)) +
492 493
		ExecCountSlotsNode(innerPlan(node)) +
		HASHJOIN_NSLOTS;
494 495 496
}

/* ----------------------------------------------------------------
497
 *		ExecEndHashJoin
498
 *
499
 *		clean up routine for HashJoin node
500 501 502
 * ----------------------------------------------------------------
 */
void
503
ExecEndHashJoin(HashJoinState *node)
504
{
505
	/*
506
	 * Free hash table
507
	 */
508
	if (node->hj_HashTable)
509
	{
510 511
		ExecHashTableDestroy(node->hj_HashTable);
		node->hj_HashTable = NULL;
512 513
	}

514
	/*
515
	 * Free the exprcontext
516
	 */
517
	ExecFreeExprContext(&node->js.ps);
518

519 520
	/*
	 * clean out the tuple table
521
	 */
522 523 524
	ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
	ExecClearTuple(node->hj_OuterTupleSlot);
	ExecClearTuple(node->hj_HashTupleSlot);
525

526 527 528 529 530
	/*
	 * clean up subtrees
	 */
	ExecEndNode(outerPlanState(node));
	ExecEndNode(innerPlanState(node));
531 532
}

533 534
/*
 * ExecHashJoinOuterGetTuple
535
 *
536
 *		get the next outer tuple for hashjoin: either by
537 538 539 540 541 542
 *		executing a plan node in the first pass, or from
 *		the temp files for the hashjoin batches.
 *
 * Returns a null slot if no more outer tuples.  On success, the tuple's
 * hash value is stored at *hashvalue --- this is either originally computed,
 * or re-read from the temp file.
543 544
 */
static TupleTableSlot *
545 546 547
ExecHashJoinOuterGetTuple(PlanState *outerNode,
						  HashJoinState *hjstate,
						  uint32 *hashvalue)
548
{
B
Bruce Momjian 已提交
549 550
	HashJoinTable hashtable = hjstate->hj_HashTable;
	int			curbatch = hashtable->curbatch;
551 552 553 554
	TupleTableSlot *slot;

	if (curbatch == 0)
	{							/* if it is the first pass */
B
Bruce Momjian 已提交
555

556 557 558 559 560 561 562 563 564
		/*
		 * Check to see if first outer tuple was already fetched by
		 * ExecHashJoin() and not used yet.
		 */
		slot = hjstate->hj_FirstOuterTupleSlot;
		if (!TupIsNull(slot))
			hjstate->hj_FirstOuterTupleSlot = NULL;
		else
			slot = ExecProcNode(outerNode);
B
Bruce Momjian 已提交
565
		if (!TupIsNull(slot))
566 567 568 569 570 571 572 573 574 575
		{
			/*
			 * We have to compute the tuple's hash value.
			 */
			ExprContext *econtext = hjstate->js.ps.ps_ExprContext;

			econtext->ecxt_outertuple = slot;
			*hashvalue = ExecHashGetHashValue(hashtable, econtext,
											  hjstate->hj_OuterHashKeys);

576 577 578
			/* remember outer relation is not empty for possible rescan */
			hjstate->hj_OuterNotEmpty = true;

579
			return slot;
580
		}
B
Bruce Momjian 已提交
581

582
		/*
B
Bruce Momjian 已提交
583 584
		 * We have just reached the end of the first pass. Try to switch to a
		 * saved batch.
585 586
		 */
		curbatch = ExecHashJoinNewBatch(hjstate);
587 588 589
	}

	/*
B
Bruce Momjian 已提交
590 591 592
	 * Try to read from a temp file. Loop allows us to advance to new batches
	 * as needed.  NOTE: nbatch could increase inside ExecHashJoinNewBatch, so
	 * don't try to optimize this loop.
593
	 */
594
	while (curbatch < hashtable->nbatch)
595 596
	{
		slot = ExecHashJoinGetSavedTuple(hjstate,
597 598
										 hashtable->outerBatchFile[curbatch],
										 hashvalue,
599
										 hjstate->hj_OuterTupleSlot);
B
Bruce Momjian 已提交
600
		if (!TupIsNull(slot))
601 602 603 604 605 606
			return slot;
		curbatch = ExecHashJoinNewBatch(hjstate);
	}

	/* Out of batches... */
	return NULL;
607 608
}

609 610
/*
 * ExecHashJoinNewBatch
611
 *		switch to a new hashjoin batch
612 613 614
 *
 * Returns the number of the new batch (1..nbatch-1), or nbatch if no more.
 * We will never return a batch number that has an empty outer batch file.
615
 */
616
static int
617
ExecHashJoinNewBatch(HashJoinState *hjstate)
618
{
619
	HashJoinTable hashtable = hjstate->hj_HashTable;
620 621
	int			nbatch;
	int			curbatch;
B
Bruce Momjian 已提交
622
	BufFile    *innerFile;
623
	TupleTableSlot *slot;
624
	uint32		hashvalue;
625

626 627 628 629 630
start_over:
	nbatch = hashtable->nbatch;
	curbatch = hashtable->curbatch;

	if (curbatch > 0)
631 632
	{
		/*
B
Bruce Momjian 已提交
633 634
		 * We no longer need the previous outer batch file; close it right
		 * away to free disk space.
635
		 */
636 637 638
		if (hashtable->outerBatchFile[curbatch])
			BufFileClose(hashtable->outerBatchFile[curbatch]);
		hashtable->outerBatchFile[curbatch] = NULL;
639 640
	}

641
	/*
642 643 644 645
	 * We can always skip over any batches that are completely empty on both
	 * sides.  We can sometimes skip over batches that are empty on only one
	 * side, but there are exceptions:
	 *
B
Bruce Momjian 已提交
646 647
	 * 1. In a LEFT JOIN, we have to process outer batches even if the inner
	 * batch is empty.
648
	 *
649 650
	 * 2. If we have increased nbatch since the initial estimate, we have to
	 * scan inner batches since they might contain tuples that need to be
B
Bruce Momjian 已提交
651
	 * reassigned to later inner batches.
652
	 *
653 654 655
	 * 3. Similarly, if we have increased nbatch since starting the outer
	 * scan, we have to rescan outer batches in case they contain tuples that
	 * need to be reassigned.
656
	 */
657 658 659 660
	curbatch++;
	while (curbatch < nbatch &&
		   (hashtable->outerBatchFile[curbatch] == NULL ||
			hashtable->innerBatchFile[curbatch] == NULL))
661
	{
662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679
		if (hashtable->outerBatchFile[curbatch] &&
			hjstate->js.jointype == JOIN_LEFT)
			break;				/* must process due to rule 1 */
		if (hashtable->innerBatchFile[curbatch] &&
			nbatch != hashtable->nbatch_original)
			break;				/* must process due to rule 2 */
		if (hashtable->outerBatchFile[curbatch] &&
			nbatch != hashtable->nbatch_outstart)
			break;				/* must process due to rule 3 */
		/* We can ignore this batch. */
		/* Release associated temp files right away. */
		if (hashtable->innerBatchFile[curbatch])
			BufFileClose(hashtable->innerBatchFile[curbatch]);
		hashtable->innerBatchFile[curbatch] = NULL;
		if (hashtable->outerBatchFile[curbatch])
			BufFileClose(hashtable->outerBatchFile[curbatch]);
		hashtable->outerBatchFile[curbatch] = NULL;
		curbatch++;
680
	}
681

682 683
	if (curbatch >= nbatch)
		return curbatch;		/* no more batches */
684

685
	hashtable->curbatch = curbatch;
686 687

	/*
688
	 * Reload the hash table with the new inner batch (which could be empty)
689
	 */
690
	ExecHashTableReset(hashtable);
691

692
	innerFile = hashtable->innerBatchFile[curbatch];
693

694
	if (innerFile != NULL)
695
	{
696 697 698
		if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
			ereport(ERROR,
					(errcode_for_file_access(),
B
Bruce Momjian 已提交
699
				   errmsg("could not rewind hash-join temporary file: %m")));
700 701 702 703 704 705 706

		while ((slot = ExecHashJoinGetSavedTuple(hjstate,
												 innerFile,
												 &hashvalue,
												 hjstate->hj_HashTupleSlot)))
		{
			/*
B
Bruce Momjian 已提交
707 708
			 * NOTE: some tuples may be sent to future batches.  Also, it is
			 * possible for hashtable->nbatch to be increased here!
709
			 */
710 711 712
			ExecHashTableInsert(hashtable,
								ExecFetchSlotTuple(slot),
								hashvalue);
713 714 715 716 717 718 719 720
		}

		/*
		 * after we build the hash table, the inner batch file is no longer
		 * needed
		 */
		BufFileClose(innerFile);
		hashtable->innerBatchFile[curbatch] = NULL;
721 722 723
	}

	/*
724
	 * If there's no outer batch file, advance to next batch.
725
	 */
726 727
	if (hashtable->outerBatchFile[curbatch] == NULL)
		goto start_over;
728

729 730 731 732 733 734 735 736 737
	/*
	 * Rewind outer batch file, so that we can start reading it.
	 */
	if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not rewind hash-join temporary file: %m")));

	return curbatch;
738 739
}

740 741 742
/*
 * ExecHashJoinSaveTuple
 *		save a tuple to a batch file.
743
 *
744 745 746
 * The data recorded in the file for each tuple is its hash value,
 * then an image of its HeapTupleData (with meaningless t_data pointer)
 * followed by the HeapTupleHeader and tuple data.
747
 *
748 749 750
 * Note: it is important always to call this in the regular executor
 * context, not in a shorter-lived context; else the temp file buffers
 * will get messed up.
751
 */
752
void
753 754
ExecHashJoinSaveTuple(HeapTuple heapTuple, uint32 hashvalue,
					  BufFile **fileptr)
755
{
B
Bruce Momjian 已提交
756
	BufFile    *file = *fileptr;
B
Bruce Momjian 已提交
757
	size_t		written;
758

759 760 761 762 763 764 765 766 767 768 769 770 771
	if (file == NULL)
	{
		/* First write to this batch file, so open it. */
		file = BufFileCreateTemp(false);
		*fileptr = file;
	}

	written = BufFileWrite(file, (void *) &hashvalue, sizeof(uint32));
	if (written != sizeof(uint32))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to hash-join temporary file: %m")));

772 773
	written = BufFileWrite(file, (void *) heapTuple, sizeof(HeapTupleData));
	if (written != sizeof(HeapTupleData))
774 775
		ereport(ERROR,
				(errcode_for_file_access(),
776 777
				 errmsg("could not write to hash-join temporary file: %m")));

778 779
	written = BufFileWrite(file, (void *) heapTuple->t_data, heapTuple->t_len);
	if (written != (size_t) heapTuple->t_len)
780 781
		ereport(ERROR,
				(errcode_for_file_access(),
782
				 errmsg("could not write to hash-join temporary file: %m")));
783
}
V
Vadim B. Mikheev 已提交
784

785 786
/*
 * ExecHashJoinGetSavedTuple
B
Bruce Momjian 已提交
787
 *		read the next tuple from a batch file.	Return NULL if no more.
788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825
 *
 * On success, *hashvalue is set to the tuple's hash value, and the tuple
 * itself is stored in the given slot.
 */
static TupleTableSlot *
ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
						  BufFile *file,
						  uint32 *hashvalue,
						  TupleTableSlot *tupleSlot)
{
	HeapTupleData htup;
	size_t		nread;
	HeapTuple	heapTuple;

	nread = BufFileRead(file, (void *) hashvalue, sizeof(uint32));
	if (nread == 0)
		return NULL;			/* end of file */
	if (nread != sizeof(uint32))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not read from hash-join temporary file: %m")));
	nread = BufFileRead(file, (void *) &htup, sizeof(HeapTupleData));
	if (nread != sizeof(HeapTupleData))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not read from hash-join temporary file: %m")));
	heapTuple = palloc(HEAPTUPLESIZE + htup.t_len);
	memcpy((char *) heapTuple, (char *) &htup, sizeof(HeapTupleData));
	heapTuple->t_data = (HeapTupleHeader)
		((char *) heapTuple + HEAPTUPLESIZE);
	nread = BufFileRead(file, (void *) heapTuple->t_data, htup.t_len);
	if (nread != (size_t) htup.t_len)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not read from hash-join temporary file: %m")));
	return ExecStoreTuple(heapTuple, tupleSlot, InvalidBuffer, true);
}

826

V
Vadim B. Mikheev 已提交
827
void
828
ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
V
Vadim B. Mikheev 已提交
829
{
830
	/*
B
Bruce Momjian 已提交
831 832 833 834 835
	 * In a multi-batch join, we currently have to do rescans the hard way,
	 * primarily because batch temp files may have already been released. But
	 * if it's a single-batch join, and there is no parameter change for the
	 * inner subnode, then we can just re-use the existing hash table without
	 * rebuilding it.
V
Vadim B. Mikheev 已提交
836
	 */
837
	if (node->hj_HashTable != NULL)
V
Vadim B. Mikheev 已提交
838
	{
839 840 841
		if (node->hj_HashTable->nbatch == 1 &&
			((PlanState *) node)->righttree->chgParam == NULL)
		{
842 843 844 845 846 847 848 849 850 851 852 853 854
			/*
			 * okay to reuse the hash table; needn't rescan inner, either.
			 *
			 * What we do need to do is reset our state about the emptiness
			 * of the outer relation, so that the new scan of the outer will
			 * update it correctly if it turns out to be empty this time.
			 * (There's no harm in clearing it now because ExecHashJoin won't
			 * need the info.  In the other cases, where the hash table
			 * doesn't exist or we are destroying it, we leave this state
			 * alone because ExecHashJoin will need it the first time
			 * through.)
			 */
			node->hj_OuterNotEmpty = false;
855 856 857 858 859 860
		}
		else
		{
			/* must destroy and rebuild hash table */
			ExecHashTableDestroy(node->hj_HashTable);
			node->hj_HashTable = NULL;
B
Bruce Momjian 已提交
861

862 863 864 865 866 867 868
			/*
			 * if chgParam of subnode is not null then plan will be re-scanned
			 * by first ExecProcNode.
			 */
			if (((PlanState *) node)->righttree->chgParam == NULL)
				ExecReScan(((PlanState *) node)->righttree, exprCtxt);
		}
V
Vadim B. Mikheev 已提交
869
	}
870

871
	/* Always reset intra-tuple state */
872
	node->hj_CurHashValue = 0;
873
	node->hj_CurBucketNo = 0;
874
	node->hj_CurTuple = NULL;
V
Vadim B. Mikheev 已提交
875

876
	node->js.ps.ps_OuterTupleSlot = NULL;
877 878 879
	node->js.ps.ps_TupFromTlist = false;
	node->hj_NeedNewOuter = true;
	node->hj_MatchedOuter = false;
880
	node->hj_FirstOuterTupleSlot = NULL;
881 882

	/*
B
Bruce Momjian 已提交
883 884
	 * if chgParam of subnode is not null then plan will be re-scanned by
	 * first ExecProcNode.
V
Vadim B. Mikheev 已提交
885
	 */
886 887
	if (((PlanState *) node)->lefttree->chgParam == NULL)
		ExecReScan(((PlanState *) node)->lefttree, exprCtxt);
V
Vadim B. Mikheev 已提交
888
}