nodeHashjoin.c 23.4 KB
Newer Older
1 2
/*-------------------------------------------------------------------------
 *
3
 * nodeHashjoin.c
4
 *	  Routines to handle hash join nodes
5
 *
P
 
PostgreSQL Daemon 已提交
6
 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
B
Add:  
Bruce Momjian 已提交
7
 * Portions Copyright (c) 1994, Regents of the University of California
8 9 10
 *
 *
 * IDENTIFICATION
11
 *	  $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.76 2005/11/20 19:49:07 tgl Exp $
12 13 14
 *
 *-------------------------------------------------------------------------
 */
15

B
Bruce Momjian 已提交
16
#include "postgres.h"
17 18

#include "executor/executor.h"
19
#include "executor/hashjoin.h"
20 21
#include "executor/nodeHash.h"
#include "executor/nodeHashjoin.h"
B
Bruce Momjian 已提交
22
#include "optimizer/clauses.h"
23 24
#include "utils/memutils.h"

25

26
static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
B
Bruce Momjian 已提交
27 28
						  HashJoinState *hjstate,
						  uint32 *hashvalue);
29
static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
30
						  BufFile *file,
31
						  uint32 *hashvalue,
B
Bruce Momjian 已提交
32
						  TupleTableSlot *tupleSlot);
33
static int	ExecHashJoinNewBatch(HashJoinState *hjstate);
34

35

36
/* ----------------------------------------------------------------
37
 *		ExecHashJoin
38
 *
39
 *		This function implements the Hybrid Hashjoin algorithm.
40 41 42
 *
 *		Note: the relation we build hash table on is the "inner"
 *			  the other one is "outer".
43 44
 * ----------------------------------------------------------------
 */
45
TupleTableSlot *				/* return: a tuple or NULL */
46
ExecHashJoin(HashJoinState *node)
47
{
48
	EState	   *estate;
49 50
	PlanState  *outerNode;
	HashState  *hashNode;
51 52
	List	   *joinqual;
	List	   *otherqual;
53
	TupleTableSlot *inntuple;
54
	ExprContext *econtext;
55
	ExprDoneCond isDone;
56 57
	HashJoinTable hashtable;
	HeapTuple	curtuple;
58
	TupleTableSlot *outerTupleSlot;
59 60
	uint32		hashvalue;
	int			batchno;
61

62 63
	/*
	 * get information from HashJoin node
64
	 */
65 66 67 68 69
	estate = node->js.ps.state;
	joinqual = node->js.joinqual;
	otherqual = node->js.ps.qual;
	hashNode = (HashState *) innerPlanState(node);
	outerNode = outerPlanState(node);
70

71
	/*
72
	 * get information from HashJoin state
73
	 */
74 75
	hashtable = node->hj_HashTable;
	econtext = node->js.ps.ps_ExprContext;
76

77
	/*
B
Bruce Momjian 已提交
78 79 80
	 * Check to see if we're still projecting out tuples from a previous join
	 * tuple (because there is a function-returning-set in the projection
	 * expressions).  If so, try to project another one.
81
	 */
82
	if (node->js.ps.ps_TupFromTlist)
83 84 85
	{
		TupleTableSlot *result;

86
		result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
87
		if (isDone == ExprMultipleResult)
88
			return result;
89
		/* Done with that source tuple... */
90
		node->js.ps.ps_TupFromTlist = false;
91
	}
92

93
	/*
B
Bruce Momjian 已提交
94 95 96
	 * If we're doing an IN join, we want to return at most one row per outer
	 * tuple; so we can stop scanning the inner scan if we matched on the
	 * previous try.
97
	 */
98
	if (node->js.jointype == JOIN_IN && node->hj_MatchedOuter)
99 100
		node->hj_NeedNewOuter = true;

101 102
	/*
	 * Reset per-tuple memory context to free any expression evaluation
B
Bruce Momjian 已提交
103 104
	 * storage allocated in the previous tuple cycle.  Note this can't happen
	 * until we're done projecting out tuples from a join tuple.
105 106 107
	 */
	ResetExprContext(econtext);

108 109 110
	/*
	 * if this is the first call, build the hash table for inner relation
	 */
111
	if (hashtable == NULL)
112 113
	{
		/*
114
		 * If the outer relation is completely empty, we can quit without
B
Bruce Momjian 已提交
115 116 117 118 119 120 121
		 * building the hash table.  However, for an inner join it is only a
		 * win to check this when the outer relation's startup cost is less
		 * than the projected cost of building the hash table.	Otherwise it's
		 * best to build the hash table first and see if the inner relation is
		 * empty.  (When it's an outer join, we should always make this check,
		 * since we aren't going to be able to skip the join on the strength
		 * of an empty inner relation anyway.)
122
		 *
B
Bruce Momjian 已提交
123 124 125
		 * The only way to make the check is to try to fetch a tuple from the
		 * outer plan node.  If we succeed, we have to stash it away for later
		 * consumption by ExecHashJoinOuterGetTuple.
126
		 */
127 128
		if (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost ||
			node->js.jointype == JOIN_LEFT)
129
		{
130 131
			node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
			if (TupIsNull(node->hj_FirstOuterTupleSlot))
132 133
				return NULL;
		}
134 135
		else
			node->hj_FirstOuterTupleSlot = NULL;
136 137

		/*
138
		 * create the hash table
139
		 */
140 141
		hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
										node->hj_HashOperators);
142
		node->hj_HashTable = hashtable;
143

144
		/*
145
		 * execute the Hash node, to build the hash table
146
		 */
147
		hashNode->hashtable = hashtable;
148
		(void) MultiExecProcNode((PlanState *) hashNode);
149

150
		/*
B
Bruce Momjian 已提交
151 152
		 * If the inner relation is completely empty, and we're not doing an
		 * outer join, we can quit without scanning the outer relation.
153
		 */
154
		if (hashtable->totalTuples == 0 && node->js.jointype != JOIN_LEFT)
155
		{
156
			ExecHashTableDestroy(hashtable);
157
			node->hj_HashTable = NULL;
158
			node->hj_FirstOuterTupleSlot = NULL;
159 160 161
			return NULL;
		}

162
		/*
163 164
		 * need to remember whether nbatch has increased since we began
		 * scanning the outer relation
165
		 */
166
		hashtable->nbatch_outstart = hashtable->nbatch;
167
	}
168

169
	/*
170
	 * run the hash join process
171
	 */
172
	for (;;)
173
	{
174
		/*
175
		 * If we don't have an outer tuple, get the next one
176
		 */
177
		if (node->hj_NeedNewOuter)
178
		{
179 180 181
			outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
													   node,
													   &hashvalue);
182
			if (TupIsNull(outerTupleSlot))
183 184 185 186 187 188 189 190 191 192 193
			{
				/* end of join */
				return NULL;
			}

			node->js.ps.ps_OuterTupleSlot = outerTupleSlot;
			econtext->ecxt_outertuple = outerTupleSlot;
			node->hj_NeedNewOuter = false;
			node->hj_MatchedOuter = false;

			/*
B
Bruce Momjian 已提交
194 195
			 * now we have an outer tuple, find the corresponding bucket for
			 * this tuple from the hash table
196 197 198 199 200 201 202
			 */
			node->hj_CurHashValue = hashvalue;
			ExecHashGetBucketAndBatch(hashtable, hashvalue,
									  &node->hj_CurBucketNo, &batchno);
			node->hj_CurTuple = NULL;

			/*
B
Bruce Momjian 已提交
203 204
			 * Now we've got an outer tuple and the corresponding hash bucket,
			 * but this tuple may not belong to the current batch.
205 206 207 208
			 */
			if (batchno != hashtable->curbatch)
			{
				/*
B
Bruce Momjian 已提交
209 210
				 * Need to postpone this outer tuple to a later batch. Save it
				 * in the corresponding outer-batch file.
211 212 213 214 215 216
				 */
				Assert(batchno > hashtable->curbatch);
				ExecHashJoinSaveTuple(ExecFetchSlotTuple(outerTupleSlot),
									  hashvalue,
									  &hashtable->outerBatchFile[batchno]);
				node->hj_NeedNewOuter = true;
B
Bruce Momjian 已提交
217
				continue;		/* loop around for a new outer tuple */
218
			}
219 220
		}

221 222
		/*
		 * OK, scan the selected hash bucket for matches
223
		 */
224
		for (;;)
225
		{
226
			curtuple = ExecScanHashBucket(node, econtext);
227 228
			if (curtuple == NULL)
				break;			/* out of matches */
B
Bruce Momjian 已提交
229

230
			/*
231
			 * we've got a match, but still need to test non-hashed quals
232
			 */
233
			inntuple = ExecStoreTuple(curtuple,
234
									  node->hj_HashTupleSlot,
235 236 237
									  InvalidBuffer,
									  false);	/* don't pfree this tuple */
			econtext->ecxt_innertuple = inntuple;
238

239
			/* reset temp memory each time to avoid leaks from qual expr */
240 241
			ResetExprContext(econtext);

242 243
			/*
			 * if we pass the qual, then save state for next call and have
B
Bruce Momjian 已提交
244 245
			 * ExecProject form the projection, store it in the tuple table,
			 * and return the slot.
246
			 *
B
Bruce Momjian 已提交
247 248
			 * Only the joinquals determine MatchedOuter status, but all quals
			 * must pass to actually return the tuple.
249
			 */
250
			if (joinqual == NIL || ExecQual(joinqual, econtext, false))
251
			{
252
				node->hj_MatchedOuter = true;
253

254
				if (otherqual == NIL || ExecQual(otherqual, econtext, false))
255
				{
256 257
					TupleTableSlot *result;

258
					result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
259 260 261

					if (isDone != ExprEndResult)
					{
262
						node->js.ps.ps_TupFromTlist =
263 264 265
							(isDone == ExprMultipleResult);
						return result;
					}
266
				}
267

B
Bruce Momjian 已提交
268
				/*
B
Bruce Momjian 已提交
269
				 * If we didn't return a tuple, may need to set NeedNewOuter
B
Bruce Momjian 已提交
270
				 */
271 272 273 274 275
				if (node->js.jointype == JOIN_IN)
				{
					node->hj_NeedNewOuter = true;
					break;		/* out of loop over hash bucket */
				}
276 277 278
			}
		}

279 280
		/*
		 * Now the current outer tuple has run out of matches, so check
B
Bruce Momjian 已提交
281 282
		 * whether to emit a dummy outer-join tuple. If not, loop around to
		 * get a new outer tuple.
283
		 */
284
		node->hj_NeedNewOuter = true;
285

286 287
		if (!node->hj_MatchedOuter &&
			node->js.jointype == JOIN_LEFT)
288 289
		{
			/*
B
Bruce Momjian 已提交
290 291 292
			 * We are doing an outer join and there were no join matches for
			 * this outer tuple.  Generate a fake join tuple with nulls for
			 * the inner tuple, and return it if it passes the non-join quals.
293
			 */
294
			econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
295 296 297

			if (ExecQual(otherqual, econtext, false))
			{
298
				/*
B
Bruce Momjian 已提交
299 300
				 * qualification was satisfied so we project and return the
				 * slot containing the result tuple using ExecProject().
301 302 303
				 */
				TupleTableSlot *result;

304
				result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
305 306 307

				if (isDone != ExprEndResult)
				{
308
					node->js.ps.ps_TupFromTlist =
309 310 311 312 313
						(isDone == ExprMultipleResult);
					return result;
				}
			}
		}
314
	}
315 316 317
}

/* ----------------------------------------------------------------
318
 *		ExecInitHashJoin
319
 *
320
 *		Init routine for HashJoin node.
321 322
 * ----------------------------------------------------------------
 */
323 324
HashJoinState *
ExecInitHashJoin(HashJoin *node, EState *estate)
325
{
326 327 328
	HashJoinState *hjstate;
	Plan	   *outerNode;
	Hash	   *hashNode;
329 330
	List	   *lclauses;
	List	   *rclauses;
331
	List	   *hoperators;
332
	ListCell   *l;
333

334
	/*
335 336 337
	 * create state structure
	 */
	hjstate = makeNode(HashJoinState);
338 339
	hjstate->js.ps.plan = (Plan *) node;
	hjstate->js.ps.state = estate;
340

341 342
	/*
	 * Miscellaneous initialization
343
	 *
344
	 * create expression context for node
345
	 */
346 347 348 349 350 351
	ExecAssignExprContext(estate, &hjstate->js.ps);

	/*
	 * initialize child expressions
	 */
	hjstate->js.ps.targetlist = (List *)
352
		ExecInitExpr((Expr *) node->join.plan.targetlist,
353 354
					 (PlanState *) hjstate);
	hjstate->js.ps.qual = (List *)
355
		ExecInitExpr((Expr *) node->join.plan.qual,
356 357 358
					 (PlanState *) hjstate);
	hjstate->js.jointype = node->join.jointype;
	hjstate->js.joinqual = (List *)
359
		ExecInitExpr((Expr *) node->join.joinqual,
360 361
					 (PlanState *) hjstate);
	hjstate->hashclauses = (List *)
362
		ExecInitExpr((Expr *) node->hashclauses,
363
					 (PlanState *) hjstate);
364

365
	/*
366
	 * initialize child nodes
367
	 */
368 369
	outerNode = outerPlan(node);
	hashNode = (Hash *) innerPlan(node);
370

371 372
	outerPlanState(hjstate) = ExecInitNode(outerNode, estate);
	innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate);
373

374
#define HASHJOIN_NSLOTS 3
375 376 377

	/*
	 * tuple table initialization
378
	 */
379
	ExecInitResultTupleSlot(estate, &hjstate->js.ps);
380 381 382 383 384
	hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate);

	switch (node->join.jointype)
	{
		case JOIN_INNER:
385
		case JOIN_IN:
386 387 388 389
			break;
		case JOIN_LEFT:
			hjstate->hj_NullInnerTupleSlot =
				ExecInitNullTupleSlot(estate,
B
Bruce Momjian 已提交
390
								 ExecGetResultType(innerPlanState(hjstate)));
391 392
			break;
		default:
393
			elog(ERROR, "unrecognized join type: %d",
394 395 396
				 (int) node->join.jointype);
	}

397
	/*
B
Bruce Momjian 已提交
398 399 400 401 402
	 * now for some voodoo.  our temporary tuple slot is actually the result
	 * tuple slot of the Hash node (which is our inner plan).  we do this
	 * because Hash nodes don't return tuples via ExecProcNode() -- instead
	 * the hash join node uses ExecScanHashBucket() to get at the contents of
	 * the hash table.	-cim 6/9/91
403 404
	 */
	{
405 406
		HashState  *hashstate = (HashState *) innerPlanState(hjstate);
		TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot;
407 408 409 410

		hjstate->hj_HashTupleSlot = slot;
	}

411 412
	/*
	 * initialize tuple type and projection info
413
	 */
414 415
	ExecAssignResultTypeFromTL(&hjstate->js.ps);
	ExecAssignProjectionInfo(&hjstate->js.ps);
416

417
	ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot,
418
						  ExecGetResultType(outerPlanState(hjstate)),
419
						  false);
420

421 422
	/*
	 * initialize hash-specific info
423
	 */
424
	hjstate->hj_HashTable = NULL;
425
	hjstate->hj_FirstOuterTupleSlot = NULL;
426

427
	hjstate->hj_CurHashValue = 0;
428
	hjstate->hj_CurBucketNo = 0;
429
	hjstate->hj_CurTuple = NULL;
430 431

	/*
B
Bruce Momjian 已提交
432 433 434 435
	 * Deconstruct the hash clauses into outer and inner argument values, so
	 * that we can evaluate those subexpressions separately.  Also make a list
	 * of the hash operator OIDs, in preparation for looking up the hash
	 * functions to use.
436
	 */
437 438
	lclauses = NIL;
	rclauses = NIL;
439
	hoperators = NIL;
440
	foreach(l, hjstate->hashclauses)
441
	{
442
		FuncExprState *fstate = (FuncExprState *) lfirst(l);
443
		OpExpr	   *hclause;
444

445 446
		Assert(IsA(fstate, FuncExprState));
		hclause = (OpExpr *) fstate->xprstate.expr;
447
		Assert(IsA(hclause, OpExpr));
448
		lclauses = lappend(lclauses, linitial(fstate->args));
449
		rclauses = lappend(rclauses, lsecond(fstate->args));
450
		hoperators = lappend_oid(hoperators, hclause->opno);
451
	}
452 453
	hjstate->hj_OuterHashKeys = lclauses;
	hjstate->hj_InnerHashKeys = rclauses;
454
	hjstate->hj_HashOperators = hoperators;
455 456
	/* child Hash node needs to evaluate inner hash keys, too */
	((HashState *) innerPlanState(hjstate))->hashkeys = rclauses;
457

458 459
	hjstate->js.ps.ps_OuterTupleSlot = NULL;
	hjstate->js.ps.ps_TupFromTlist = false;
460 461
	hjstate->hj_NeedNewOuter = true;
	hjstate->hj_MatchedOuter = false;
462

463
	return hjstate;
464 465 466
}

int
467
ExecCountSlotsHashJoin(HashJoin *node)
468
{
469
	return ExecCountSlotsNode(outerPlan(node)) +
470 471
		ExecCountSlotsNode(innerPlan(node)) +
		HASHJOIN_NSLOTS;
472 473 474
}

/* ----------------------------------------------------------------
475
 *		ExecEndHashJoin
476
 *
477
 *		clean up routine for HashJoin node
478 479 480
 * ----------------------------------------------------------------
 */
void
481
ExecEndHashJoin(HashJoinState *node)
482
{
483
	/*
484
	 * Free hash table
485
	 */
486
	if (node->hj_HashTable)
487
	{
488 489
		ExecHashTableDestroy(node->hj_HashTable);
		node->hj_HashTable = NULL;
490
		node->hj_FirstOuterTupleSlot = NULL;
491 492
	}

493
	/*
494
	 * Free the exprcontext
495
	 */
496
	ExecFreeExprContext(&node->js.ps);
497

498 499
	/*
	 * clean out the tuple table
500
	 */
501 502 503
	ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
	ExecClearTuple(node->hj_OuterTupleSlot);
	ExecClearTuple(node->hj_HashTupleSlot);
504

505 506 507 508 509
	/*
	 * clean up subtrees
	 */
	ExecEndNode(outerPlanState(node));
	ExecEndNode(innerPlanState(node));
510 511
}

512 513
/*
 * ExecHashJoinOuterGetTuple
514
 *
515
 *		get the next outer tuple for hashjoin: either by
516 517 518 519 520 521
 *		executing a plan node in the first pass, or from
 *		the temp files for the hashjoin batches.
 *
 * Returns a null slot if no more outer tuples.  On success, the tuple's
 * hash value is stored at *hashvalue --- this is either originally computed,
 * or re-read from the temp file.
522 523
 */
static TupleTableSlot *
524 525 526
ExecHashJoinOuterGetTuple(PlanState *outerNode,
						  HashJoinState *hjstate,
						  uint32 *hashvalue)
527
{
B
Bruce Momjian 已提交
528 529
	HashJoinTable hashtable = hjstate->hj_HashTable;
	int			curbatch = hashtable->curbatch;
530 531 532 533
	TupleTableSlot *slot;

	if (curbatch == 0)
	{							/* if it is the first pass */
B
Bruce Momjian 已提交
534

535 536 537 538 539 540 541 542 543
		/*
		 * Check to see if first outer tuple was already fetched by
		 * ExecHashJoin() and not used yet.
		 */
		slot = hjstate->hj_FirstOuterTupleSlot;
		if (!TupIsNull(slot))
			hjstate->hj_FirstOuterTupleSlot = NULL;
		else
			slot = ExecProcNode(outerNode);
B
Bruce Momjian 已提交
544
		if (!TupIsNull(slot))
545 546 547 548 549 550 551 552 553 554
		{
			/*
			 * We have to compute the tuple's hash value.
			 */
			ExprContext *econtext = hjstate->js.ps.ps_ExprContext;

			econtext->ecxt_outertuple = slot;
			*hashvalue = ExecHashGetHashValue(hashtable, econtext,
											  hjstate->hj_OuterHashKeys);

555
			return slot;
556
		}
B
Bruce Momjian 已提交
557

558
		/*
B
Bruce Momjian 已提交
559 560
		 * We have just reached the end of the first pass. Try to switch to a
		 * saved batch.
561 562
		 */
		curbatch = ExecHashJoinNewBatch(hjstate);
563 564 565
	}

	/*
B
Bruce Momjian 已提交
566 567 568
	 * Try to read from a temp file. Loop allows us to advance to new batches
	 * as needed.  NOTE: nbatch could increase inside ExecHashJoinNewBatch, so
	 * don't try to optimize this loop.
569
	 */
570
	while (curbatch < hashtable->nbatch)
571 572
	{
		slot = ExecHashJoinGetSavedTuple(hjstate,
573 574
										 hashtable->outerBatchFile[curbatch],
										 hashvalue,
575
										 hjstate->hj_OuterTupleSlot);
B
Bruce Momjian 已提交
576
		if (!TupIsNull(slot))
577 578 579 580 581 582
			return slot;
		curbatch = ExecHashJoinNewBatch(hjstate);
	}

	/* Out of batches... */
	return NULL;
583 584
}

585 586
/*
 * ExecHashJoinNewBatch
587
 *		switch to a new hashjoin batch
588 589 590
 *
 * Returns the number of the new batch (1..nbatch-1), or nbatch if no more.
 * We will never return a batch number that has an empty outer batch file.
591
 */
592
static int
593
ExecHashJoinNewBatch(HashJoinState *hjstate)
594
{
595
	HashJoinTable hashtable = hjstate->hj_HashTable;
596 597
	int			nbatch;
	int			curbatch;
B
Bruce Momjian 已提交
598
	BufFile    *innerFile;
599
	TupleTableSlot *slot;
600
	uint32		hashvalue;
601

602 603 604 605 606
start_over:
	nbatch = hashtable->nbatch;
	curbatch = hashtable->curbatch;

	if (curbatch > 0)
607 608
	{
		/*
B
Bruce Momjian 已提交
609 610
		 * We no longer need the previous outer batch file; close it right
		 * away to free disk space.
611
		 */
612 613 614
		if (hashtable->outerBatchFile[curbatch])
			BufFileClose(hashtable->outerBatchFile[curbatch]);
		hashtable->outerBatchFile[curbatch] = NULL;
615 616
	}

617
	/*
618 619 620 621
	 * We can always skip over any batches that are completely empty on both
	 * sides.  We can sometimes skip over batches that are empty on only one
	 * side, but there are exceptions:
	 *
B
Bruce Momjian 已提交
622 623
	 * 1. In a LEFT JOIN, we have to process outer batches even if the inner
	 * batch is empty.
624
	 *
B
Bruce Momjian 已提交
625 626 627
	 * 2. If we have increased nbatch since the initial estimate, we have to scan
	 * inner batches since they might contain tuples that need to be
	 * reassigned to later inner batches.
628
	 *
B
Bruce Momjian 已提交
629 630 631
	 * 3. Similarly, if we have increased nbatch since starting the outer scan,
	 * we have to rescan outer batches in case they contain tuples that need
	 * to be reassigned.
632
	 */
633 634 635 636
	curbatch++;
	while (curbatch < nbatch &&
		   (hashtable->outerBatchFile[curbatch] == NULL ||
			hashtable->innerBatchFile[curbatch] == NULL))
637
	{
638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655
		if (hashtable->outerBatchFile[curbatch] &&
			hjstate->js.jointype == JOIN_LEFT)
			break;				/* must process due to rule 1 */
		if (hashtable->innerBatchFile[curbatch] &&
			nbatch != hashtable->nbatch_original)
			break;				/* must process due to rule 2 */
		if (hashtable->outerBatchFile[curbatch] &&
			nbatch != hashtable->nbatch_outstart)
			break;				/* must process due to rule 3 */
		/* We can ignore this batch. */
		/* Release associated temp files right away. */
		if (hashtable->innerBatchFile[curbatch])
			BufFileClose(hashtable->innerBatchFile[curbatch]);
		hashtable->innerBatchFile[curbatch] = NULL;
		if (hashtable->outerBatchFile[curbatch])
			BufFileClose(hashtable->outerBatchFile[curbatch]);
		hashtable->outerBatchFile[curbatch] = NULL;
		curbatch++;
656
	}
657

658 659
	if (curbatch >= nbatch)
		return curbatch;		/* no more batches */
660

661
	hashtable->curbatch = curbatch;
662 663

	/*
664
	 * Reload the hash table with the new inner batch (which could be empty)
665
	 */
666
	ExecHashTableReset(hashtable);
667

668
	innerFile = hashtable->innerBatchFile[curbatch];
669

670
	if (innerFile != NULL)
671
	{
672 673 674
		if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
			ereport(ERROR,
					(errcode_for_file_access(),
B
Bruce Momjian 已提交
675
				   errmsg("could not rewind hash-join temporary file: %m")));
676 677 678 679 680 681 682

		while ((slot = ExecHashJoinGetSavedTuple(hjstate,
												 innerFile,
												 &hashvalue,
												 hjstate->hj_HashTupleSlot)))
		{
			/*
B
Bruce Momjian 已提交
683 684
			 * NOTE: some tuples may be sent to future batches.  Also, it is
			 * possible for hashtable->nbatch to be increased here!
685
			 */
686 687 688
			ExecHashTableInsert(hashtable,
								ExecFetchSlotTuple(slot),
								hashvalue);
689 690 691 692 693 694 695 696
		}

		/*
		 * after we build the hash table, the inner batch file is no longer
		 * needed
		 */
		BufFileClose(innerFile);
		hashtable->innerBatchFile[curbatch] = NULL;
697 698 699
	}

	/*
700
	 * If there's no outer batch file, advance to next batch.
701
	 */
702 703
	if (hashtable->outerBatchFile[curbatch] == NULL)
		goto start_over;
704

705 706 707 708 709 710 711 712 713
	/*
	 * Rewind outer batch file, so that we can start reading it.
	 */
	if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not rewind hash-join temporary file: %m")));

	return curbatch;
714 715
}

716 717 718
/*
 * ExecHashJoinSaveTuple
 *		save a tuple to a batch file.
719
 *
720 721 722
 * The data recorded in the file for each tuple is its hash value,
 * then an image of its HeapTupleData (with meaningless t_data pointer)
 * followed by the HeapTupleHeader and tuple data.
723
 *
724 725 726
 * Note: it is important always to call this in the regular executor
 * context, not in a shorter-lived context; else the temp file buffers
 * will get messed up.
727
 */
728
void
729 730
ExecHashJoinSaveTuple(HeapTuple heapTuple, uint32 hashvalue,
					  BufFile **fileptr)
731
{
B
Bruce Momjian 已提交
732
	BufFile    *file = *fileptr;
B
Bruce Momjian 已提交
733
	size_t		written;
734

735 736 737 738 739 740 741 742 743 744 745 746 747
	if (file == NULL)
	{
		/* First write to this batch file, so open it. */
		file = BufFileCreateTemp(false);
		*fileptr = file;
	}

	written = BufFileWrite(file, (void *) &hashvalue, sizeof(uint32));
	if (written != sizeof(uint32))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to hash-join temporary file: %m")));

748 749
	written = BufFileWrite(file, (void *) heapTuple, sizeof(HeapTupleData));
	if (written != sizeof(HeapTupleData))
750 751
		ereport(ERROR,
				(errcode_for_file_access(),
752 753
				 errmsg("could not write to hash-join temporary file: %m")));

754 755
	written = BufFileWrite(file, (void *) heapTuple->t_data, heapTuple->t_len);
	if (written != (size_t) heapTuple->t_len)
756 757
		ereport(ERROR,
				(errcode_for_file_access(),
758
				 errmsg("could not write to hash-join temporary file: %m")));
759
}
V
Vadim B. Mikheev 已提交
760

761 762
/*
 * ExecHashJoinGetSavedTuple
B
Bruce Momjian 已提交
763
 *		read the next tuple from a batch file.	Return NULL if no more.
764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801
 *
 * On success, *hashvalue is set to the tuple's hash value, and the tuple
 * itself is stored in the given slot.
 */
static TupleTableSlot *
ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
						  BufFile *file,
						  uint32 *hashvalue,
						  TupleTableSlot *tupleSlot)
{
	HeapTupleData htup;
	size_t		nread;
	HeapTuple	heapTuple;

	nread = BufFileRead(file, (void *) hashvalue, sizeof(uint32));
	if (nread == 0)
		return NULL;			/* end of file */
	if (nread != sizeof(uint32))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not read from hash-join temporary file: %m")));
	nread = BufFileRead(file, (void *) &htup, sizeof(HeapTupleData));
	if (nread != sizeof(HeapTupleData))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not read from hash-join temporary file: %m")));
	heapTuple = palloc(HEAPTUPLESIZE + htup.t_len);
	memcpy((char *) heapTuple, (char *) &htup, sizeof(HeapTupleData));
	heapTuple->t_data = (HeapTupleHeader)
		((char *) heapTuple + HEAPTUPLESIZE);
	nread = BufFileRead(file, (void *) heapTuple->t_data, htup.t_len);
	if (nread != (size_t) htup.t_len)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not read from hash-join temporary file: %m")));
	return ExecStoreTuple(heapTuple, tupleSlot, InvalidBuffer, true);
}

802

V
Vadim B. Mikheev 已提交
803
void
804
ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
V
Vadim B. Mikheev 已提交
805
{
806
	/*
B
Bruce Momjian 已提交
807 808
	 * If we haven't yet built the hash table then we can just return; nothing
	 * done yet, so nothing to undo.
809
	 */
810
	if (node->hj_HashTable == NULL)
V
Vadim B. Mikheev 已提交
811
		return;
812 813

	/*
B
Bruce Momjian 已提交
814 815 816 817 818
	 * In a multi-batch join, we currently have to do rescans the hard way,
	 * primarily because batch temp files may have already been released. But
	 * if it's a single-batch join, and there is no parameter change for the
	 * inner subnode, then we can just re-use the existing hash table without
	 * rebuilding it.
V
Vadim B. Mikheev 已提交
819
	 */
820
	if (node->hj_HashTable->nbatch == 1 &&
821 822 823 824 825
		((PlanState *) node)->righttree->chgParam == NULL)
	{
		/* okay to reuse the hash table; needn't rescan inner, either */
	}
	else
V
Vadim B. Mikheev 已提交
826
	{
827
		/* must destroy and rebuild hash table */
828 829
		ExecHashTableDestroy(node->hj_HashTable);
		node->hj_HashTable = NULL;
830
		node->hj_FirstOuterTupleSlot = NULL;
B
Bruce Momjian 已提交
831

832
		/*
B
Bruce Momjian 已提交
833 834
		 * if chgParam of subnode is not null then plan will be re-scanned by
		 * first ExecProcNode.
835 836 837
		 */
		if (((PlanState *) node)->righttree->chgParam == NULL)
			ExecReScan(((PlanState *) node)->righttree, exprCtxt);
V
Vadim B. Mikheev 已提交
838
	}
839

840
	/* Always reset intra-tuple state */
841
	node->hj_CurHashValue = 0;
842
	node->hj_CurBucketNo = 0;
843
	node->hj_CurTuple = NULL;
V
Vadim B. Mikheev 已提交
844

845
	node->js.ps.ps_OuterTupleSlot = NULL;
846 847 848
	node->js.ps.ps_TupFromTlist = false;
	node->hj_NeedNewOuter = true;
	node->hj_MatchedOuter = false;
849 850

	/*
B
Bruce Momjian 已提交
851 852
	 * if chgParam of subnode is not null then plan will be re-scanned by
	 * first ExecProcNode.
V
Vadim B. Mikheev 已提交
853
	 */
854 855
	if (((PlanState *) node)->lefttree->chgParam == NULL)
		ExecReScan(((PlanState *) node)->lefttree, exprCtxt);
V
Vadim B. Mikheev 已提交
856
}