nodeHashjoin.c 24.3 KB
Newer Older
1 2
/*-------------------------------------------------------------------------
 *
3
 * nodeHashjoin.c
4
 *	  Routines to handle hash join nodes
5
 *
6
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
B
Add:  
Bruce Momjian 已提交
7
 * Portions Copyright (c) 1994, Regents of the University of California
8 9 10
 *
 *
 * IDENTIFICATION
11
 *	  $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.86 2007/01/05 22:19:28 momjian Exp $
12 13 14
 *
 *-------------------------------------------------------------------------
 */
15

B
Bruce Momjian 已提交
16
#include "postgres.h"
17 18

#include "executor/executor.h"
19
#include "executor/hashjoin.h"
20 21
#include "executor/nodeHash.h"
#include "executor/nodeHashjoin.h"
22 23
#include "utils/memutils.h"

24

25
static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
B
Bruce Momjian 已提交
26 27
						  HashJoinState *hjstate,
						  uint32 *hashvalue);
28
static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
29
						  BufFile *file,
30
						  uint32 *hashvalue,
B
Bruce Momjian 已提交
31
						  TupleTableSlot *tupleSlot);
32
static int	ExecHashJoinNewBatch(HashJoinState *hjstate);
33

34

35
/* ----------------------------------------------------------------
36
 *		ExecHashJoin
37
 *
38
 *		This function implements the Hybrid Hashjoin algorithm.
39 40 41
 *
 *		Note: the relation we build hash table on is the "inner"
 *			  the other one is "outer".
42 43
 * ----------------------------------------------------------------
 */
44
TupleTableSlot *				/* return: a tuple or NULL */
45
ExecHashJoin(HashJoinState *node)
46
{
47
	EState	   *estate;
48 49
	PlanState  *outerNode;
	HashState  *hashNode;
50 51
	List	   *joinqual;
	List	   *otherqual;
52
	TupleTableSlot *inntuple;
53
	ExprContext *econtext;
54
	ExprDoneCond isDone;
55
	HashJoinTable hashtable;
56
	HashJoinTuple curtuple;
57
	TupleTableSlot *outerTupleSlot;
58 59
	uint32		hashvalue;
	int			batchno;
60

61 62
	/*
	 * get information from HashJoin node
63
	 */
64 65 66 67 68
	estate = node->js.ps.state;
	joinqual = node->js.joinqual;
	otherqual = node->js.ps.qual;
	hashNode = (HashState *) innerPlanState(node);
	outerNode = outerPlanState(node);
69

70
	/*
71
	 * get information from HashJoin state
72
	 */
73 74
	hashtable = node->hj_HashTable;
	econtext = node->js.ps.ps_ExprContext;
75

76
	/*
B
Bruce Momjian 已提交
77 78 79
	 * Check to see if we're still projecting out tuples from a previous join
	 * tuple (because there is a function-returning-set in the projection
	 * expressions).  If so, try to project another one.
80
	 */
81
	if (node->js.ps.ps_TupFromTlist)
82 83 84
	{
		TupleTableSlot *result;

85
		result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
86
		if (isDone == ExprMultipleResult)
87
			return result;
88
		/* Done with that source tuple... */
89
		node->js.ps.ps_TupFromTlist = false;
90
	}
91

92
	/*
B
Bruce Momjian 已提交
93 94 95
	 * If we're doing an IN join, we want to return at most one row per outer
	 * tuple; so we can stop scanning the inner scan if we matched on the
	 * previous try.
96
	 */
97
	if (node->js.jointype == JOIN_IN && node->hj_MatchedOuter)
98 99
		node->hj_NeedNewOuter = true;

100 101
	/*
	 * Reset per-tuple memory context to free any expression evaluation
B
Bruce Momjian 已提交
102 103
	 * storage allocated in the previous tuple cycle.  Note this can't happen
	 * until we're done projecting out tuples from a join tuple.
104 105 106
	 */
	ResetExprContext(econtext);

107 108 109
	/*
	 * if this is the first call, build the hash table for inner relation
	 */
110
	if (hashtable == NULL)
111 112
	{
		/*
113
		 * If the outer relation is completely empty, we can quit without
B
Bruce Momjian 已提交
114 115 116 117 118 119 120
		 * building the hash table.  However, for an inner join it is only a
		 * win to check this when the outer relation's startup cost is less
		 * than the projected cost of building the hash table.	Otherwise it's
		 * best to build the hash table first and see if the inner relation is
		 * empty.  (When it's an outer join, we should always make this check,
		 * since we aren't going to be able to skip the join on the strength
		 * of an empty inner relation anyway.)
121
		 *
B
Bruce Momjian 已提交
122 123 124 125 126
		 * If we are rescanning the join, we make use of information gained on
		 * the previous scan: don't bother to try the prefetch if the previous
		 * scan found the outer relation nonempty.	This is not 100% reliable
		 * since with new parameters the outer relation might yield different
		 * results, but it's a good heuristic.
127
		 *
B
Bruce Momjian 已提交
128 129 130
		 * The only way to make the check is to try to fetch a tuple from the
		 * outer plan node.  If we succeed, we have to stash it away for later
		 * consumption by ExecHashJoinOuterGetTuple.
131
		 */
132 133 134
		if (node->js.jointype == JOIN_LEFT ||
			(outerNode->plan->startup_cost < hashNode->ps.plan->total_cost &&
			 !node->hj_OuterNotEmpty))
135
		{
136 137
			node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
			if (TupIsNull(node->hj_FirstOuterTupleSlot))
138 139
			{
				node->hj_OuterNotEmpty = false;
140
				return NULL;
141 142 143
			}
			else
				node->hj_OuterNotEmpty = true;
144
		}
145 146
		else
			node->hj_FirstOuterTupleSlot = NULL;
147 148

		/*
149
		 * create the hash table
150
		 */
151 152
		hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
										node->hj_HashOperators);
153
		node->hj_HashTable = hashtable;
154

155
		/*
156
		 * execute the Hash node, to build the hash table
157
		 */
158
		hashNode->hashtable = hashtable;
159
		(void) MultiExecProcNode((PlanState *) hashNode);
160

161
		/*
B
Bruce Momjian 已提交
162 163
		 * If the inner relation is completely empty, and we're not doing an
		 * outer join, we can quit without scanning the outer relation.
164
		 */
165
		if (hashtable->totalTuples == 0 && node->js.jointype != JOIN_LEFT)
166 167
			return NULL;

168
		/*
169 170
		 * need to remember whether nbatch has increased since we began
		 * scanning the outer relation
171
		 */
172
		hashtable->nbatch_outstart = hashtable->nbatch;
173 174 175

		/*
		 * Reset OuterNotEmpty for scan.  (It's OK if we fetched a tuple
B
Bruce Momjian 已提交
176 177
		 * above, because ExecHashJoinOuterGetTuple will immediately set it
		 * again.)
178 179
		 */
		node->hj_OuterNotEmpty = false;
180
	}
181

182
	/*
183
	 * run the hash join process
184
	 */
185
	for (;;)
186
	{
187
		/*
188
		 * If we don't have an outer tuple, get the next one
189
		 */
190
		if (node->hj_NeedNewOuter)
191
		{
192 193 194
			outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
													   node,
													   &hashvalue);
195
			if (TupIsNull(outerTupleSlot))
196 197 198 199 200 201 202 203 204 205 206
			{
				/* end of join */
				return NULL;
			}

			node->js.ps.ps_OuterTupleSlot = outerTupleSlot;
			econtext->ecxt_outertuple = outerTupleSlot;
			node->hj_NeedNewOuter = false;
			node->hj_MatchedOuter = false;

			/*
B
Bruce Momjian 已提交
207 208
			 * now we have an outer tuple, find the corresponding bucket for
			 * this tuple from the hash table
209 210 211 212 213 214 215
			 */
			node->hj_CurHashValue = hashvalue;
			ExecHashGetBucketAndBatch(hashtable, hashvalue,
									  &node->hj_CurBucketNo, &batchno);
			node->hj_CurTuple = NULL;

			/*
B
Bruce Momjian 已提交
216 217
			 * Now we've got an outer tuple and the corresponding hash bucket,
			 * but this tuple may not belong to the current batch.
218 219 220 221
			 */
			if (batchno != hashtable->curbatch)
			{
				/*
B
Bruce Momjian 已提交
222 223
				 * Need to postpone this outer tuple to a later batch. Save it
				 * in the corresponding outer-batch file.
224 225
				 */
				Assert(batchno > hashtable->curbatch);
226
				ExecHashJoinSaveTuple(ExecFetchSlotMinimalTuple(outerTupleSlot),
227 228 229
									  hashvalue,
									  &hashtable->outerBatchFile[batchno]);
				node->hj_NeedNewOuter = true;
B
Bruce Momjian 已提交
230
				continue;		/* loop around for a new outer tuple */
231
			}
232 233
		}

234 235
		/*
		 * OK, scan the selected hash bucket for matches
236
		 */
237
		for (;;)
238
		{
239
			curtuple = ExecScanHashBucket(node, econtext);
240 241
			if (curtuple == NULL)
				break;			/* out of matches */
B
Bruce Momjian 已提交
242

243
			/*
244
			 * we've got a match, but still need to test non-hashed quals
245
			 */
246 247 248
			inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(curtuple),
											 node->hj_HashTupleSlot,
											 false);	/* don't pfree */
249
			econtext->ecxt_innertuple = inntuple;
250

251
			/* reset temp memory each time to avoid leaks from qual expr */
252 253
			ResetExprContext(econtext);

254 255
			/*
			 * if we pass the qual, then save state for next call and have
B
Bruce Momjian 已提交
256 257
			 * ExecProject form the projection, store it in the tuple table,
			 * and return the slot.
258
			 *
B
Bruce Momjian 已提交
259 260
			 * Only the joinquals determine MatchedOuter status, but all quals
			 * must pass to actually return the tuple.
261
			 */
262
			if (joinqual == NIL || ExecQual(joinqual, econtext, false))
263
			{
264
				node->hj_MatchedOuter = true;
265

266
				if (otherqual == NIL || ExecQual(otherqual, econtext, false))
267
				{
268 269
					TupleTableSlot *result;

270
					result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
271 272 273

					if (isDone != ExprEndResult)
					{
274
						node->js.ps.ps_TupFromTlist =
275 276 277
							(isDone == ExprMultipleResult);
						return result;
					}
278
				}
279

B
Bruce Momjian 已提交
280
				/*
B
Bruce Momjian 已提交
281
				 * If we didn't return a tuple, may need to set NeedNewOuter
B
Bruce Momjian 已提交
282
				 */
283 284 285 286 287
				if (node->js.jointype == JOIN_IN)
				{
					node->hj_NeedNewOuter = true;
					break;		/* out of loop over hash bucket */
				}
288 289 290
			}
		}

291 292
		/*
		 * Now the current outer tuple has run out of matches, so check
B
Bruce Momjian 已提交
293 294
		 * whether to emit a dummy outer-join tuple. If not, loop around to
		 * get a new outer tuple.
295
		 */
296
		node->hj_NeedNewOuter = true;
297

298 299
		if (!node->hj_MatchedOuter &&
			node->js.jointype == JOIN_LEFT)
300 301
		{
			/*
B
Bruce Momjian 已提交
302 303 304
			 * We are doing an outer join and there were no join matches for
			 * this outer tuple.  Generate a fake join tuple with nulls for
			 * the inner tuple, and return it if it passes the non-join quals.
305
			 */
306
			econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
307 308 309

			if (ExecQual(otherqual, econtext, false))
			{
310
				/*
B
Bruce Momjian 已提交
311 312
				 * qualification was satisfied so we project and return the
				 * slot containing the result tuple using ExecProject().
313 314 315
				 */
				TupleTableSlot *result;

316
				result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
317 318 319

				if (isDone != ExprEndResult)
				{
320
					node->js.ps.ps_TupFromTlist =
321 322 323 324 325
						(isDone == ExprMultipleResult);
					return result;
				}
			}
		}
326
	}
327 328 329
}

/* ----------------------------------------------------------------
330
 *		ExecInitHashJoin
331
 *
332
 *		Init routine for HashJoin node.
333 334
 * ----------------------------------------------------------------
 */
335
HashJoinState *
336
ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
337
{
338 339 340
	HashJoinState *hjstate;
	Plan	   *outerNode;
	Hash	   *hashNode;
341 342
	List	   *lclauses;
	List	   *rclauses;
343
	List	   *hoperators;
344
	ListCell   *l;
345

346 347 348
	/* check for unsupported flags */
	Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));

349
	/*
350 351 352
	 * create state structure
	 */
	hjstate = makeNode(HashJoinState);
353 354
	hjstate->js.ps.plan = (Plan *) node;
	hjstate->js.ps.state = estate;
355

356 357
	/*
	 * Miscellaneous initialization
358
	 *
359
	 * create expression context for node
360
	 */
361 362 363 364 365 366
	ExecAssignExprContext(estate, &hjstate->js.ps);

	/*
	 * initialize child expressions
	 */
	hjstate->js.ps.targetlist = (List *)
367
		ExecInitExpr((Expr *) node->join.plan.targetlist,
368 369
					 (PlanState *) hjstate);
	hjstate->js.ps.qual = (List *)
370
		ExecInitExpr((Expr *) node->join.plan.qual,
371 372 373
					 (PlanState *) hjstate);
	hjstate->js.jointype = node->join.jointype;
	hjstate->js.joinqual = (List *)
374
		ExecInitExpr((Expr *) node->join.joinqual,
375 376
					 (PlanState *) hjstate);
	hjstate->hashclauses = (List *)
377
		ExecInitExpr((Expr *) node->hashclauses,
378
					 (PlanState *) hjstate);
379

380
	/*
381
	 * initialize child nodes
382 383 384 385
	 *
	 * Note: we could suppress the REWIND flag for the inner input, which
	 * would amount to betting that the hash will be a single batch.  Not
	 * clear if this would be a win or not.
386
	 */
387 388
	outerNode = outerPlan(node);
	hashNode = (Hash *) innerPlan(node);
389

390 391
	outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags);
	innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate, eflags);
392

393
#define HASHJOIN_NSLOTS 3
394 395 396

	/*
	 * tuple table initialization
397
	 */
398
	ExecInitResultTupleSlot(estate, &hjstate->js.ps);
399 400 401 402 403
	hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate);

	switch (node->join.jointype)
	{
		case JOIN_INNER:
404
		case JOIN_IN:
405 406 407 408
			break;
		case JOIN_LEFT:
			hjstate->hj_NullInnerTupleSlot =
				ExecInitNullTupleSlot(estate,
B
Bruce Momjian 已提交
409
								 ExecGetResultType(innerPlanState(hjstate)));
410 411
			break;
		default:
412
			elog(ERROR, "unrecognized join type: %d",
413 414 415
				 (int) node->join.jointype);
	}

416
	/*
B
Bruce Momjian 已提交
417 418 419 420 421
	 * now for some voodoo.  our temporary tuple slot is actually the result
	 * tuple slot of the Hash node (which is our inner plan).  we do this
	 * because Hash nodes don't return tuples via ExecProcNode() -- instead
	 * the hash join node uses ExecScanHashBucket() to get at the contents of
	 * the hash table.	-cim 6/9/91
422 423
	 */
	{
424 425
		HashState  *hashstate = (HashState *) innerPlanState(hjstate);
		TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot;
426 427 428 429

		hjstate->hj_HashTupleSlot = slot;
	}

430 431
	/*
	 * initialize tuple type and projection info
432
	 */
433 434
	ExecAssignResultTypeFromTL(&hjstate->js.ps);
	ExecAssignProjectionInfo(&hjstate->js.ps);
435

436
	ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot,
437
						  ExecGetResultType(outerPlanState(hjstate)));
438

439 440
	/*
	 * initialize hash-specific info
441
	 */
442
	hjstate->hj_HashTable = NULL;
443
	hjstate->hj_FirstOuterTupleSlot = NULL;
444

445
	hjstate->hj_CurHashValue = 0;
446
	hjstate->hj_CurBucketNo = 0;
447
	hjstate->hj_CurTuple = NULL;
448 449

	/*
B
Bruce Momjian 已提交
450 451 452 453
	 * Deconstruct the hash clauses into outer and inner argument values, so
	 * that we can evaluate those subexpressions separately.  Also make a list
	 * of the hash operator OIDs, in preparation for looking up the hash
	 * functions to use.
454
	 */
455 456
	lclauses = NIL;
	rclauses = NIL;
457
	hoperators = NIL;
458
	foreach(l, hjstate->hashclauses)
459
	{
460
		FuncExprState *fstate = (FuncExprState *) lfirst(l);
461
		OpExpr	   *hclause;
462

463 464
		Assert(IsA(fstate, FuncExprState));
		hclause = (OpExpr *) fstate->xprstate.expr;
465
		Assert(IsA(hclause, OpExpr));
466
		lclauses = lappend(lclauses, linitial(fstate->args));
467
		rclauses = lappend(rclauses, lsecond(fstate->args));
468
		hoperators = lappend_oid(hoperators, hclause->opno);
469
	}
470 471
	hjstate->hj_OuterHashKeys = lclauses;
	hjstate->hj_InnerHashKeys = rclauses;
472
	hjstate->hj_HashOperators = hoperators;
473 474
	/* child Hash node needs to evaluate inner hash keys, too */
	((HashState *) innerPlanState(hjstate))->hashkeys = rclauses;
475

476 477
	hjstate->js.ps.ps_OuterTupleSlot = NULL;
	hjstate->js.ps.ps_TupFromTlist = false;
478 479
	hjstate->hj_NeedNewOuter = true;
	hjstate->hj_MatchedOuter = false;
480
	hjstate->hj_OuterNotEmpty = false;
481

482
	return hjstate;
483 484 485
}

int
486
ExecCountSlotsHashJoin(HashJoin *node)
487
{
488
	return ExecCountSlotsNode(outerPlan(node)) +
489 490
		ExecCountSlotsNode(innerPlan(node)) +
		HASHJOIN_NSLOTS;
491 492 493
}

/* ----------------------------------------------------------------
494
 *		ExecEndHashJoin
495
 *
496
 *		clean up routine for HashJoin node
497 498 499
 * ----------------------------------------------------------------
 */
void
500
ExecEndHashJoin(HashJoinState *node)
501
{
502
	/*
503
	 * Free hash table
504
	 */
505
	if (node->hj_HashTable)
506
	{
507 508
		ExecHashTableDestroy(node->hj_HashTable);
		node->hj_HashTable = NULL;
509 510
	}

511
	/*
512
	 * Free the exprcontext
513
	 */
514
	ExecFreeExprContext(&node->js.ps);
515

516 517
	/*
	 * clean out the tuple table
518
	 */
519 520 521
	ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
	ExecClearTuple(node->hj_OuterTupleSlot);
	ExecClearTuple(node->hj_HashTupleSlot);
522

523 524 525 526 527
	/*
	 * clean up subtrees
	 */
	ExecEndNode(outerPlanState(node));
	ExecEndNode(innerPlanState(node));
528 529
}

530 531
/*
 * ExecHashJoinOuterGetTuple
532
 *
533
 *		get the next outer tuple for hashjoin: either by
534 535 536 537 538 539
 *		executing a plan node in the first pass, or from
 *		the temp files for the hashjoin batches.
 *
 * Returns a null slot if no more outer tuples.  On success, the tuple's
 * hash value is stored at *hashvalue --- this is either originally computed,
 * or re-read from the temp file.
540 541
 */
static TupleTableSlot *
542 543 544
ExecHashJoinOuterGetTuple(PlanState *outerNode,
						  HashJoinState *hjstate,
						  uint32 *hashvalue)
545
{
B
Bruce Momjian 已提交
546 547
	HashJoinTable hashtable = hjstate->hj_HashTable;
	int			curbatch = hashtable->curbatch;
548 549 550 551
	TupleTableSlot *slot;

	if (curbatch == 0)
	{							/* if it is the first pass */
B
Bruce Momjian 已提交
552

553 554 555 556 557 558 559 560 561
		/*
		 * Check to see if first outer tuple was already fetched by
		 * ExecHashJoin() and not used yet.
		 */
		slot = hjstate->hj_FirstOuterTupleSlot;
		if (!TupIsNull(slot))
			hjstate->hj_FirstOuterTupleSlot = NULL;
		else
			slot = ExecProcNode(outerNode);
B
Bruce Momjian 已提交
562
		if (!TupIsNull(slot))
563 564 565 566 567 568 569 570 571 572
		{
			/*
			 * We have to compute the tuple's hash value.
			 */
			ExprContext *econtext = hjstate->js.ps.ps_ExprContext;

			econtext->ecxt_outertuple = slot;
			*hashvalue = ExecHashGetHashValue(hashtable, econtext,
											  hjstate->hj_OuterHashKeys);

573 574 575
			/* remember outer relation is not empty for possible rescan */
			hjstate->hj_OuterNotEmpty = true;

576
			return slot;
577
		}
B
Bruce Momjian 已提交
578

579
		/*
B
Bruce Momjian 已提交
580 581
		 * We have just reached the end of the first pass. Try to switch to a
		 * saved batch.
582 583
		 */
		curbatch = ExecHashJoinNewBatch(hjstate);
584 585 586
	}

	/*
B
Bruce Momjian 已提交
587 588 589
	 * Try to read from a temp file. Loop allows us to advance to new batches
	 * as needed.  NOTE: nbatch could increase inside ExecHashJoinNewBatch, so
	 * don't try to optimize this loop.
590
	 */
591
	while (curbatch < hashtable->nbatch)
592 593
	{
		slot = ExecHashJoinGetSavedTuple(hjstate,
594 595
										 hashtable->outerBatchFile[curbatch],
										 hashvalue,
596
										 hjstate->hj_OuterTupleSlot);
B
Bruce Momjian 已提交
597
		if (!TupIsNull(slot))
598 599 600 601 602 603
			return slot;
		curbatch = ExecHashJoinNewBatch(hjstate);
	}

	/* Out of batches... */
	return NULL;
604 605
}

606 607
/*
 * ExecHashJoinNewBatch
608
 *		switch to a new hashjoin batch
609 610 611
 *
 * Returns the number of the new batch (1..nbatch-1), or nbatch if no more.
 * We will never return a batch number that has an empty outer batch file.
612
 */
613
static int
614
ExecHashJoinNewBatch(HashJoinState *hjstate)
615
{
616
	HashJoinTable hashtable = hjstate->hj_HashTable;
617 618
	int			nbatch;
	int			curbatch;
B
Bruce Momjian 已提交
619
	BufFile    *innerFile;
620
	TupleTableSlot *slot;
621
	uint32		hashvalue;
622

623 624 625 626 627
start_over:
	nbatch = hashtable->nbatch;
	curbatch = hashtable->curbatch;

	if (curbatch > 0)
628 629
	{
		/*
B
Bruce Momjian 已提交
630 631
		 * We no longer need the previous outer batch file; close it right
		 * away to free disk space.
632
		 */
633 634 635
		if (hashtable->outerBatchFile[curbatch])
			BufFileClose(hashtable->outerBatchFile[curbatch]);
		hashtable->outerBatchFile[curbatch] = NULL;
636 637
	}

638
	/*
639 640 641 642
	 * We can always skip over any batches that are completely empty on both
	 * sides.  We can sometimes skip over batches that are empty on only one
	 * side, but there are exceptions:
	 *
B
Bruce Momjian 已提交
643 644
	 * 1. In a LEFT JOIN, we have to process outer batches even if the inner
	 * batch is empty.
645
	 *
646 647
	 * 2. If we have increased nbatch since the initial estimate, we have to
	 * scan inner batches since they might contain tuples that need to be
B
Bruce Momjian 已提交
648
	 * reassigned to later inner batches.
649
	 *
650 651 652
	 * 3. Similarly, if we have increased nbatch since starting the outer
	 * scan, we have to rescan outer batches in case they contain tuples that
	 * need to be reassigned.
653
	 */
654 655 656 657
	curbatch++;
	while (curbatch < nbatch &&
		   (hashtable->outerBatchFile[curbatch] == NULL ||
			hashtable->innerBatchFile[curbatch] == NULL))
658
	{
659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676
		if (hashtable->outerBatchFile[curbatch] &&
			hjstate->js.jointype == JOIN_LEFT)
			break;				/* must process due to rule 1 */
		if (hashtable->innerBatchFile[curbatch] &&
			nbatch != hashtable->nbatch_original)
			break;				/* must process due to rule 2 */
		if (hashtable->outerBatchFile[curbatch] &&
			nbatch != hashtable->nbatch_outstart)
			break;				/* must process due to rule 3 */
		/* We can ignore this batch. */
		/* Release associated temp files right away. */
		if (hashtable->innerBatchFile[curbatch])
			BufFileClose(hashtable->innerBatchFile[curbatch]);
		hashtable->innerBatchFile[curbatch] = NULL;
		if (hashtable->outerBatchFile[curbatch])
			BufFileClose(hashtable->outerBatchFile[curbatch]);
		hashtable->outerBatchFile[curbatch] = NULL;
		curbatch++;
677
	}
678

679 680
	if (curbatch >= nbatch)
		return curbatch;		/* no more batches */
681

682
	hashtable->curbatch = curbatch;
683 684

	/*
685
	 * Reload the hash table with the new inner batch (which could be empty)
686
	 */
687
	ExecHashTableReset(hashtable);
688

689
	innerFile = hashtable->innerBatchFile[curbatch];
690

691
	if (innerFile != NULL)
692
	{
693 694 695
		if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
			ereport(ERROR,
					(errcode_for_file_access(),
B
Bruce Momjian 已提交
696
				   errmsg("could not rewind hash-join temporary file: %m")));
697 698 699 700 701 702 703

		while ((slot = ExecHashJoinGetSavedTuple(hjstate,
												 innerFile,
												 &hashvalue,
												 hjstate->hj_HashTupleSlot)))
		{
			/*
B
Bruce Momjian 已提交
704 705
			 * NOTE: some tuples may be sent to future batches.  Also, it is
			 * possible for hashtable->nbatch to be increased here!
706
			 */
707
			ExecHashTableInsert(hashtable, slot, hashvalue);
708 709 710 711 712 713 714 715
		}

		/*
		 * after we build the hash table, the inner batch file is no longer
		 * needed
		 */
		BufFileClose(innerFile);
		hashtable->innerBatchFile[curbatch] = NULL;
716 717 718
	}

	/*
719
	 * If there's no outer batch file, advance to next batch.
720
	 */
721 722
	if (hashtable->outerBatchFile[curbatch] == NULL)
		goto start_over;
723

724 725 726 727 728 729 730 731 732
	/*
	 * Rewind outer batch file, so that we can start reading it.
	 */
	if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not rewind hash-join temporary file: %m")));

	return curbatch;
733 734
}

735 736 737
/*
 * ExecHashJoinSaveTuple
 *		save a tuple to a batch file.
738
 *
739
 * The data recorded in the file for each tuple is its hash value,
740
 * then the tuple in MinimalTuple format.
741
 *
742 743 744
 * Note: it is important always to call this in the regular executor
 * context, not in a shorter-lived context; else the temp file buffers
 * will get messed up.
745
 */
746
void
747
ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
748
					  BufFile **fileptr)
749
{
B
Bruce Momjian 已提交
750
	BufFile    *file = *fileptr;
B
Bruce Momjian 已提交
751
	size_t		written;
752

753 754 755 756 757 758 759 760 761 762 763 764 765
	if (file == NULL)
	{
		/* First write to this batch file, so open it. */
		file = BufFileCreateTemp(false);
		*fileptr = file;
	}

	written = BufFileWrite(file, (void *) &hashvalue, sizeof(uint32));
	if (written != sizeof(uint32))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to hash-join temporary file: %m")));

766 767
	written = BufFileWrite(file, (void *) tuple, tuple->t_len);
	if (written != tuple->t_len)
768 769
		ereport(ERROR,
				(errcode_for_file_access(),
770
				 errmsg("could not write to hash-join temporary file: %m")));
771
}
V
Vadim B. Mikheev 已提交
772

773 774
/*
 * ExecHashJoinGetSavedTuple
B
Bruce Momjian 已提交
775
 *		read the next tuple from a batch file.	Return NULL if no more.
776 777 778 779 780 781 782 783 784 785
 *
 * On success, *hashvalue is set to the tuple's hash value, and the tuple
 * itself is stored in the given slot.
 */
static TupleTableSlot *
ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
						  BufFile *file,
						  uint32 *hashvalue,
						  TupleTableSlot *tupleSlot)
{
786
	uint32		header[2];
787
	size_t		nread;
788
	MinimalTuple tuple;
789

790
	/*
B
Bruce Momjian 已提交
791 792 793
	 * Since both the hash value and the MinimalTuple length word are uint32,
	 * we can read them both in one BufFileRead() call without any type
	 * cheating.
794 795
	 */
	nread = BufFileRead(file, (void *) header, sizeof(header));
B
Bruce Momjian 已提交
796
	if (nread == 0)				/* end of file */
797 798 799 800 801
	{
		ExecClearTuple(tupleSlot);
		return NULL;
	}
	if (nread != sizeof(header))
802 803 804
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not read from hash-join temporary file: %m")));
805 806 807 808 809 810 811
	*hashvalue = header[0];
	tuple = (MinimalTuple) palloc(header[1]);
	tuple->t_len = header[1];
	nread = BufFileRead(file,
						(void *) ((char *) tuple + sizeof(uint32)),
						header[1] - sizeof(uint32));
	if (nread != header[1] - sizeof(uint32))
812 813 814
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not read from hash-join temporary file: %m")));
815
	return ExecStoreMinimalTuple(tuple, tupleSlot, true);
816 817
}

818

V
Vadim B. Mikheev 已提交
819
void
820
ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
V
Vadim B. Mikheev 已提交
821
{
822
	/*
B
Bruce Momjian 已提交
823 824 825 826 827
	 * In a multi-batch join, we currently have to do rescans the hard way,
	 * primarily because batch temp files may have already been released. But
	 * if it's a single-batch join, and there is no parameter change for the
	 * inner subnode, then we can just re-use the existing hash table without
	 * rebuilding it.
V
Vadim B. Mikheev 已提交
828
	 */
829
	if (node->hj_HashTable != NULL)
V
Vadim B. Mikheev 已提交
830
	{
831 832 833
		if (node->hj_HashTable->nbatch == 1 &&
			((PlanState *) node)->righttree->chgParam == NULL)
		{
834 835 836
			/*
			 * okay to reuse the hash table; needn't rescan inner, either.
			 *
B
Bruce Momjian 已提交
837 838
			 * What we do need to do is reset our state about the emptiness of
			 * the outer relation, so that the new scan of the outer will
839 840 841 842 843 844 845 846
			 * update it correctly if it turns out to be empty this time.
			 * (There's no harm in clearing it now because ExecHashJoin won't
			 * need the info.  In the other cases, where the hash table
			 * doesn't exist or we are destroying it, we leave this state
			 * alone because ExecHashJoin will need it the first time
			 * through.)
			 */
			node->hj_OuterNotEmpty = false;
847 848 849 850 851 852
		}
		else
		{
			/* must destroy and rebuild hash table */
			ExecHashTableDestroy(node->hj_HashTable);
			node->hj_HashTable = NULL;
B
Bruce Momjian 已提交
853

854 855 856 857 858 859 860
			/*
			 * if chgParam of subnode is not null then plan will be re-scanned
			 * by first ExecProcNode.
			 */
			if (((PlanState *) node)->righttree->chgParam == NULL)
				ExecReScan(((PlanState *) node)->righttree, exprCtxt);
		}
V
Vadim B. Mikheev 已提交
861
	}
862

863
	/* Always reset intra-tuple state */
864
	node->hj_CurHashValue = 0;
865
	node->hj_CurBucketNo = 0;
866
	node->hj_CurTuple = NULL;
V
Vadim B. Mikheev 已提交
867

868
	node->js.ps.ps_OuterTupleSlot = NULL;
869 870 871
	node->js.ps.ps_TupFromTlist = false;
	node->hj_NeedNewOuter = true;
	node->hj_MatchedOuter = false;
872
	node->hj_FirstOuterTupleSlot = NULL;
873 874

	/*
B
Bruce Momjian 已提交
875 876
	 * if chgParam of subnode is not null then plan will be re-scanned by
	 * first ExecProcNode.
V
Vadim B. Mikheev 已提交
877
	 */
878 879
	if (((PlanState *) node)->lefttree->chgParam == NULL)
		ExecReScan(((PlanState *) node)->lefttree, exprCtxt);
V
Vadim B. Mikheev 已提交
880
}