/*-------------------------------------------------------------------------
 *
 * tuplestore.c
 *	  Generalized routines for temporary tuple storage.
 *
 * This module handles temporary storage of tuples for purposes such
 * as Materialize nodes, hashjoin batch files, etc.  It is essentially
 * a dumbed-down version of tuplesort.c; it does no sorting of tuples
 * but can only store and regurgitate a sequence of tuples.  However,
 * because no sort is required, it is allowed to start reading the sequence
 * before it has all been written.  This is particularly useful for cursors,
 * because it allows random access within the already-scanned portion of
 * a query without having to process the underlying scan to completion.
 * A temporary file is used to handle the data if it exceeds the
 * space limit specified by the caller.
 *
 * The (approximate) amount of memory allowed to the tuplestore is specified
 * in kilobytes by the caller.  We absorb tuples and simply store them in an
 * in-memory array as long as we haven't exceeded maxKBytes.  If we do exceed
 * maxKBytes, we dump all the tuples into a temp file and then read from that
 * when needed.
 *
 * When the caller requests random access to the data, we write the temp file
 * in a format that allows either forward or backward scan.  Otherwise, only
 * forward scan is allowed.  But rewind and markpos/restorepos are allowed
 * in any case.
 *
 * Because we allow reading before writing is complete, there are two
 * interesting positions in the temp file: the current read position and
 * the current write position.  At any given instant, the temp file's seek
 * position corresponds to one of these, and the other one is remembered in
 * the Tuplestore's state.
 *
 *
 * Portions Copyright (c) 1996-2004, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/utils/sort/tuplestore.c,v 1.19 2004/08/29 04:13:00 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/heapam.h"
#include "storage/buffile.h"
#include "utils/tuplestore.h"

/*
 * Possible states of a Tuplestore object.  These denote the states that
 * persist between calls of Tuplestore routines.
 *
 * A store starts out in TSS_INMEM and moves to the file-based states once
 * it no longer fits in memory; WRITEFILE and READFILE differ only in which
 * of the two interesting positions the temp file is currently seeked to.
 */
typedef enum
{
	TSS_INMEM,					/* Tuples still fit in memory */
	TSS_WRITEFILE,				/* Writing to temp file */
	TSS_READFILE				/* Reading from temp file */
} TupStoreStatus;

/*
 * Private state of a Tuplestore operation.
 */
struct Tuplestorestate
{
	TupStoreStatus status;		/* enumerated value as shown above */
	bool		randomAccess;	/* did caller request random access? */
68
	bool		interXact;		/* keep open through transactions? */
69 70 71 72 73 74 75 76 77
	long		availMem;		/* remaining memory available, in bytes */
	BufFile    *myfile;			/* underlying file, or NULL if none */

	/*
	 * These function pointers decouple the routines that must know what
	 * kind of tuple we are handling from the routines that don't need to
	 * know it. They are set up by the tuplestore_begin_xxx routines.
	 *
	 * (Although tuplestore.c currently only supports heap tuples, I've
B
Bruce Momjian 已提交
78 79
	 * copied this part of tuplesort.c so that extension to other kinds of
	 * objects will be easy if it's ever needed.)
80
	 *
B
Bruce Momjian 已提交
81 82 83
	 * Function to copy a supplied input tuple into palloc'd space. (NB: we
	 * assume that a single pfree() is enough to release the tuple later,
	 * so the representation must be "flat" in one palloc chunk.)
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
	 * state->availMem must be decreased by the amount of space used.
	 */
	void	   *(*copytup) (Tuplestorestate *state, void *tup);

	/*
	 * Function to write a stored tuple onto tape.	The representation of
	 * the tuple on tape need not be the same as it is in memory;
	 * requirements on the tape representation are given below.  After
	 * writing the tuple, pfree() it, and increase state->availMem by the
	 * amount of memory space thereby released.
	 */
	void		(*writetup) (Tuplestorestate *state, void *tup);

	/*
	 * Function to read a stored tuple from tape back into memory. 'len'
	 * is the already-read length of the stored tuple.	Create and return
	 * a palloc'd copy, and decrease state->availMem by the amount of
	 * memory space consumed.
	 */
	void	   *(*readtup) (Tuplestorestate *state, unsigned int len);

	/*
	 * This array holds pointers to tuples in memory if we are in state
107
	 * INMEM.	In states WRITEFILE and READFILE it's not used.
108 109 110 111 112 113
	 */
	void	  **memtuples;		/* array of pointers to palloc'd tuples */
	int			memtupcount;	/* number of tuples currently present */
	int			memtupsize;		/* allocated length of memtuples array */

	/*
114 115 116
	 * These variables are used to keep track of the current position.
	 *
	 * In state WRITEFILE, the current file seek position is the write point,
B
Bruce Momjian 已提交
117 118 119 120 121
	 * and the read position is remembered in readpos_xxx; in state
	 * READFILE, the current file seek position is the read point, and the
	 * write position is remembered in writepos_xxx.  (The write position
	 * is the same as EOF, but since BufFileSeek doesn't currently
	 * implement SEEK_END, we have to remember it explicitly.)
122 123 124 125 126
	 *
	 * Special case: if we are in WRITEFILE state and eof_reached is true,
	 * then the read position is implicitly equal to the write position
	 * (and hence to the file seek position); this way we need not update
	 * the readpos_xxx variables on each write.
127
	 */
128 129 130 131 132
	bool		eof_reached;	/* read reached EOF (always valid) */
	int			current;		/* next array index (valid if INMEM) */
	int			readpos_file;	/* file# (valid if WRITEFILE and not eof) */
	long		readpos_offset; /* offset (valid if WRITEFILE and not eof) */
	int			writepos_file;	/* file# (valid if READFILE) */
B
Bruce Momjian 已提交
133
	long		writepos_offset;	/* offset (valid if READFILE) */
134 135

	/* markpos_xxx holds marked position for mark and restore */
B
Bruce Momjian 已提交
136
	int			markpos_current;	/* saved "current" */
137 138
	int			markpos_file;	/* saved "readpos_file" */
	long		markpos_offset; /* saved "readpos_offset" */
139 140 141
};

/*
 * Convenience wrappers: the first three dispatch through the per-tuple-type
 * function pointers in the state; the last three do the memory accounting
 * against state->availMem (LACKMEM is true once the budget has gone
 * negative).
 */
#define COPYTUP(state,tup)	((*(state)->copytup) (state, tup))
#define WRITETUP(state,tup) ((*(state)->writetup) (state, tup))
#define READTUP(state,len)	((*(state)->readtup) (state, len))
#define LACKMEM(state)		((state)->availMem < 0)
#define USEMEM(state,amt)	((state)->availMem -= (amt))
#define FREEMEM(state,amt)	((state)->availMem += (amt))

/*--------------------
 *
 * NOTES about on-tape representation of tuples:
 *
 * We require the first "unsigned int" of a stored tuple to be the total size
 * on-tape of the tuple, including itself (so it is never zero).
 * The remainder of the stored tuple
 * may or may not match the in-memory representation of the tuple ---
 * any conversion needed is the job of the writetup and readtup routines.
 *
 * If state->randomAccess is true, then the stored representation of the
 * tuple must be followed by another "unsigned int" that is a copy of the
 * length --- so the total tape space used is actually sizeof(unsigned int)
 * more than the stored length value.  This allows read-backwards.  When
 * randomAccess is not true, the write/read routines may omit the extra
 * length word.
 *
 * writetup is expected to write both length words as well as the tuple
 * data.  When readtup is called, the tape is positioned just after the
 * front length word; readtup must read the tuple data and advance past
 * the back length word (if present).
 *
 * The write/read routines can make use of the tuple description data
 * stored in the Tuplestorestate record, if needed. They are also expected
 * to adjust state->availMem by the amount of memory space (not tape space!)
 * released or consumed.  There is no error return from either writetup
 * or readtup; they should ereport() on failure.
 *
 *
 * NOTES about memory consumption calculations:
 *
 * We count space allocated for tuples against the maxKBytes limit,
 * plus the space used by the variable-size array memtuples.
 * Fixed-size space (primarily the BufFile I/O buffer) is not counted.
 *
 * Note that we count actual space used (as shown by GetMemoryChunkSpace)
 * rather than the originally-requested size.  This is important since
 * palloc can add substantial overhead.  It's not a complete answer since
 * we won't count any wasted space in palloc allocation blocks, but it's
 * a lot better than what we were doing before 7.3.
 *
 *--------------------
 */


/* Forward declarations of file-local routines */
static Tuplestorestate *tuplestore_begin_common(bool randomAccess,
						bool interXact,
						int maxKBytes);
static void dumptuples(Tuplestorestate *state);
static unsigned int getlen(Tuplestorestate *state, bool eofOK);
static void *copytup_heap(Tuplestorestate *state, void *tup);
static void writetup_heap(Tuplestorestate *state, void *tup);
static void *readtup_heap(Tuplestorestate *state, unsigned int len);


/*
 *		tuplestore_begin_xxx
 *
 * Initialize for a tuple store operation.
 */
static Tuplestorestate *
209
tuplestore_begin_common(bool randomAccess, bool interXact, int maxKBytes)
210 211 212
{
	Tuplestorestate *state;

213
	state = (Tuplestorestate *) palloc0(sizeof(Tuplestorestate));
214

215
	state->status = TSS_INMEM;
216
	state->randomAccess = randomAccess;
217
	state->interXact = interXact;
218 219 220 221
	state->availMem = maxKBytes * 1024L;
	state->myfile = NULL;

	state->memtupcount = 0;
222
	state->memtupsize = 1024;					/* initial guess */
223 224
	state->memtuples = (void **) palloc(state->memtupsize * sizeof(void *));

225 226
	USEMEM(state, GetMemoryChunkSpace(state->memtuples));

227 228 229
	state->eof_reached = false;
	state->current = 0;

230 231 232
	return state;
}

233 234 235 236 237 238 239 240 241 242
/*
 * tuplestore_begin_heap
 *
 * Create a new tuplestore; other types of tuple stores (other than
 * "heap" tuple stores, for heap tuples) are possible, but not presently
 * implemented.
 *
 * randomAccess: if true, both forward and backward accesses to the
 * tuple store are allowed.
 *
243
 * interXact: if true, the files used for on-disk storage persist beyond the
B
Bruce Momjian 已提交
244
 * end of the current transaction.	NOTE: It's the caller's responsibility to
245 246 247
 * create such a tuplestore in a memory context that will also survive
 * transaction boundaries, and to ensure the tuplestore is closed when it's
 * no longer wanted.
248 249
 *
 * maxKBytes: how much data to store in memory (any data beyond this
250
 * amount is paged to disk).  When in doubt, use work_mem.
251
 */
252
Tuplestorestate *
253
tuplestore_begin_heap(bool randomAccess, bool interXact, int maxKBytes)
254
{
255
	Tuplestorestate *state = tuplestore_begin_common(randomAccess,
256 257
													 interXact,
													 maxKBytes);
258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286

	state->copytup = copytup_heap;
	state->writetup = writetup_heap;
	state->readtup = readtup_heap;

	return state;
}

/*
 * tuplestore_end
 *
 *	Release resources and clean up.
 *
 * Closes the temp file if one was created, frees any tuples still held in
 * the in-memory array, the array itself, and finally the state record
 * allocated by tuplestore_begin_common.  The caller must not use 'state'
 * after this returns.
 */
void
tuplestore_end(Tuplestorestate *state)
{
	int			i;

	if (state->myfile)
		BufFileClose(state->myfile);
	if (state->memtuples)
	{
		for (i = 0; i < state->memtupcount; i++)
			pfree(state->memtuples[i]);
		pfree(state->memtuples);
	}

	/*
	 * Release the state record too; otherwise it leaks until the
	 * containing memory context is reset or deleted.
	 */
	pfree(state);
}

/*
 * tuplestore_ateof
 *
 * Returns the current eof_reached state.
 */
bool
tuplestore_ateof(Tuplestorestate *state)
{
	return state->eof_reached;
}

/*
 * Accept one tuple and append it to the tuplestore.
 *
 * Note that the input tuple is always copied; the caller need not save it.
 *
 * If the read status is currently "AT EOF" then it remains so (the read
 * pointer advances along with the write pointer); otherwise the read
 * pointer is unchanged.  This is for the convenience of nodeMaterial.c.
 */
void
tuplestore_puttuple(Tuplestorestate *state, void *tuple)
{
	/*
	 * Copy the tuple.  (Must do this even in WRITEFILE case.)
	 */
	tuple = COPYTUP(state, tuple);

	switch (state->status)
	{
		case TSS_INMEM:
			/* Grow the array as needed */
			if (state->memtupcount >= state->memtupsize)
			{
				/* re-account for the array: release old size, charge new */
				FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
				state->memtupsize *= 2;
				state->memtuples = (void **)
					repalloc(state->memtuples,
							 state->memtupsize * sizeof(void *));
				USEMEM(state, GetMemoryChunkSpace(state->memtuples));
			}

			/* Stash the tuple in the in-memory array */
			state->memtuples[state->memtupcount++] = tuple;

			/* If eof_reached, keep read position in sync */
			if (state->eof_reached)
				state->current = state->memtupcount;

			/*
			 * Done if we still fit in available memory.
			 */
			if (!LACKMEM(state))
				return;

			/*
			 * Nope; time to switch to tape-based operation.
			 */
			state->myfile = BufFileCreateTemp(state->interXact);
			state->status = TSS_WRITEFILE;
			dumptuples(state);
			break;
		case TSS_WRITEFILE:
			WRITETUP(state, tuple);
			break;
		case TSS_READFILE:

			/*
			 * Switch from reading to writing.  Remember the read position
			 * (unless it is implicitly at EOF), then seek to the saved
			 * write position.
			 */
			if (!state->eof_reached)
				BufFileTell(state->myfile,
							&state->readpos_file, &state->readpos_offset);
			if (BufFileSeek(state->myfile,
							state->writepos_file, state->writepos_offset,
							SEEK_SET) != 0)
				elog(ERROR, "seek to EOF failed");
			state->status = TSS_WRITEFILE;
			WRITETUP(state, tuple);
			break;
		default:
			elog(ERROR, "invalid tuplestore state");
			break;
	}
}

/*
 * Fetch the next tuple in either forward or back direction.
 * Returns NULL if no more tuples.  If should_free is set, the
 * caller must pfree the returned tuple when done with it.
 *
 * Backward fetches require that the store was created with randomAccess.
 * In the in-memory state the returned pointer refers to the store's own
 * copy (*should_free = false); in the file-based states a fresh copy is
 * read from the temp file (*should_free = true).
 */
void *
tuplestore_gettuple(Tuplestorestate *state, bool forward,
					bool *should_free)
{
	unsigned int tuplen;
	void	   *tup;

	Assert(forward || state->randomAccess);

	switch (state->status)
	{
		case TSS_INMEM:
			*should_free = false;
			if (forward)
			{
				if (state->current < state->memtupcount)
					return state->memtuples[state->current++];
				state->eof_reached = true;
				return NULL;
			}
			else
			{
				if (state->current <= 0)
					return NULL;

				/*
				 * if all tuples are fetched already then we return last
				 * tuple, else - tuple before last returned.
				 */
				if (state->eof_reached)
					state->eof_reached = false;
				else
				{
					state->current--;	/* last returned tuple */
					if (state->current <= 0)
						return NULL;
				}
				return state->memtuples[state->current - 1];
			}
			break;

		case TSS_WRITEFILE:
			/* Skip state change if we'll just return NULL */
			if (state->eof_reached && forward)
			{
				/* don't leave the output parameter unset on this path */
				*should_free = false;
				return NULL;
			}

			/*
			 * Switch from writing to reading.  Save the write position;
			 * if the read position is not implicitly at EOF, seek to it.
			 */
			BufFileTell(state->myfile,
						&state->writepos_file, &state->writepos_offset);
			if (!state->eof_reached)
				if (BufFileSeek(state->myfile,
								state->readpos_file, state->readpos_offset,
								SEEK_SET) != 0)
					elog(ERROR, "seek failed");
			state->status = TSS_READFILE;
			/* FALL THRU into READFILE case */

		case TSS_READFILE:
			*should_free = true;
			if (forward)
			{
				if ((tuplen = getlen(state, true)) != 0)
				{
					tup = READTUP(state, tuplen);
					return tup;
				}
				else
				{
					state->eof_reached = true;
					return NULL;
				}
			}

			/*
			 * Backward.
			 *
			 * if all tuples are fetched already then we return last tuple,
			 * else - tuple before last returned.
			 *
			 * Back up to fetch previously-returned tuple's ending length
			 * word.  If seek fails, assume we are at start of file.
			 */
			if (BufFileSeek(state->myfile, 0, -(long) sizeof(unsigned int),
							SEEK_CUR) != 0)
				return NULL;
			tuplen = getlen(state, false);

			if (state->eof_reached)
			{
				state->eof_reached = false;
				/* We will return the tuple returned before returning NULL */
			}
			else
			{
				/*
				 * Back up to get ending length word of tuple before it.
				 */
				if (BufFileSeek(state->myfile, 0,
								-(long) (tuplen + 2 * sizeof(unsigned int)),
								SEEK_CUR) != 0)
				{
					/*
					 * If that fails, presumably the prev tuple is the
					 * first in the file.  Back up so that it becomes next
					 * to read in forward direction (not obviously right,
					 * but that is what in-memory case does).
					 */
					if (BufFileSeek(state->myfile, 0,
									-(long) (tuplen + sizeof(unsigned int)),
									SEEK_CUR) != 0)
						elog(ERROR, "bogus tuple length in backward scan");
					return NULL;
				}
				tuplen = getlen(state, false);
			}

			/*
			 * Now we have the length of the prior tuple, back up and read
			 * it. Note: READTUP expects we are positioned after the
			 * initial length word of the tuple, so back up to that point.
			 */
			if (BufFileSeek(state->myfile, 0,
							-(long) tuplen,
							SEEK_CUR) != 0)
				elog(ERROR, "bogus tuple length in backward scan");
			tup = READTUP(state, tuplen);
			return tup;

		default:
			elog(ERROR, "invalid tuplestore state");
			return NULL;		/* keep compiler quiet */
	}
}

/*
 * dumptuples - remove tuples from memory and write to tape
 *
 * As a side effect, we must set readpos and markpos to the value
 * corresponding to "current"; otherwise, a dump would lose the current read
 * position.
 */
static void
dumptuples(Tuplestorestate *state)
{
	int			i;

	/*
	 * The loop deliberately runs one step past the last tuple, so that a
	 * read or mark position equal to memtupcount (i.e. "at end") is
	 * captured as the file position after the final tuple.
	 */
	for (i = 0;; i++)
	{
		if (i == state->current)
			BufFileTell(state->myfile,
						&state->readpos_file, &state->readpos_offset);
		if (i == state->markpos_current)
			BufFileTell(state->myfile,
						&state->markpos_file, &state->markpos_offset);
		if (i >= state->memtupcount)
			break;
		WRITETUP(state, state->memtuples[i]);
	}
	state->memtupcount = 0;
}
}

/*
 * tuplestore_rescan		- rewind and replay the scan
 */
void
tuplestore_rescan(Tuplestorestate *state)
{
	switch (state->status)
	{
548 549
		case TSS_INMEM:
			state->eof_reached = false;
550
			state->current = 0;
551 552
			break;
		case TSS_WRITEFILE:
553
			state->eof_reached = false;
554 555
			state->readpos_file = 0;
			state->readpos_offset = 0L;
556 557
			break;
		case TSS_READFILE:
558
			state->eof_reached = false;
559
			if (BufFileSeek(state->myfile, 0, 0L, SEEK_SET) != 0)
560
				elog(ERROR, "seek to start failed");
561 562
			break;
		default:
563
			elog(ERROR, "invalid tuplestore state");
564 565 566 567 568 569 570 571 572 573 574 575
			break;
	}
}

/*
 * tuplestore_markpos	- saves current position in the tuple sequence
 *
 * The saved position is the current read position, stored in the
 * markpos_xxx fields appropriate to the current state.
 */
void
tuplestore_markpos(Tuplestorestate *state)
{
	switch (state->status)
	{
		case TSS_INMEM:
			state->markpos_current = state->current;
			break;
		case TSS_WRITEFILE:
			if (state->eof_reached)
			{
				/* Need to record the implicit read position */
				BufFileTell(state->myfile,
							&state->markpos_file,
							&state->markpos_offset);
			}
			else
			{
				state->markpos_file = state->readpos_file;
				state->markpos_offset = state->readpos_offset;
			}
			break;
		case TSS_READFILE:
			/* file seek position is the read position in this state */
			BufFileTell(state->myfile,
						&state->markpos_file,
						&state->markpos_offset);
			break;
		default:
			elog(ERROR, "invalid tuplestore state");
			break;
	}
}

/*
 * tuplestore_restorepos - restores current position in tuple sequence to
 *						  last saved position
 *
 * Clears eof_reached and moves the read position back to the values held
 * in the markpos_xxx fields.
 */
void
tuplestore_restorepos(Tuplestorestate *state)
{
	switch (state->status)
	{
		case TSS_INMEM:
			state->eof_reached = false;
			state->current = state->markpos_current;
			break;
		case TSS_WRITEFILE:
			state->eof_reached = false;
			state->readpos_file = state->markpos_file;
			state->readpos_offset = state->markpos_offset;
			break;
		case TSS_READFILE:
			state->eof_reached = false;
			if (BufFileSeek(state->myfile,
							state->markpos_file,
							state->markpos_offset,
							SEEK_SET) != 0)
				elog(ERROR, "tuplestore_restorepos failed");
			break;
		default:
			elog(ERROR, "invalid tuplestore state");
			break;
	}
}


/*
 * Tape interface routines
 */

/*
 * getlen - read one length word from the temp file at the current position
 *
 * Returns the length word read.  If nothing at all could be read, returns 0
 * when eofOK is true and raises an error otherwise.  A partial read of the
 * length word is always an error.
 */
static unsigned int
getlen(Tuplestorestate *state, bool eofOK)
{
	unsigned int len;
	size_t		nbytes;

	nbytes = BufFileRead(state->myfile, (void *) &len, sizeof(len));
	if (nbytes == sizeof(len))
		return len;
	if (nbytes != 0)
		elog(ERROR, "unexpected end of tape");
	if (!eofOK)
		elog(ERROR, "unexpected end of data");
	return 0;
}


/*
 * Routines specialized for HeapTuple case
 */

/*
 * copytup_heap - make a private copy of a heap tuple
 *
 * The space occupied by the copy is charged against state->availMem.
 */
static void *
copytup_heap(Tuplestorestate *state, void *tup)
{
	HeapTuple	tuple = (HeapTuple) tup;

	tuple = heap_copytuple(tuple);
	USEMEM(state, GetMemoryChunkSpace(tuple));
	return (void *) tuple;
}

/*
 * writetup_heap - write a heap tuple to the temp file and free it
 *
 * We don't bother to write the HeapTupleData part of the tuple.
 *
 * On-file layout: a leading length word (tuple data length plus the size
 * of the length word itself), the tuple data, and, if randomAccess, a
 * trailing copy of the length word.  The in-memory tuple is then released
 * and its space credited back to state->availMem.
 */

static void
writetup_heap(Tuplestorestate *state, void *tup)
{
	HeapTuple	tuple = (HeapTuple) tup;
	unsigned int tuplen;

	tuplen = tuple->t_len + sizeof(tuplen);
	if (BufFileWrite(state->myfile, (void *) &tuplen,
					 sizeof(tuplen)) != sizeof(tuplen))
		elog(ERROR, "write failed");
	if (BufFileWrite(state->myfile, (void *) tuple->t_data,
					 tuple->t_len) != (size_t) tuple->t_len)
		elog(ERROR, "write failed");
	if (state->randomAccess)	/* need trailing length word? */
		if (BufFileWrite(state->myfile, (void *) &tuplen,
						 sizeof(tuplen)) != sizeof(tuplen))
			elog(ERROR, "write failed");

	FREEMEM(state, GetMemoryChunkSpace(tuple));
	heap_freetuple(tuple);
}

/*
 * readtup_heap - read a heap tuple back from the temp file
 *
 * 'len' is the leading length word, which has already been read; it counts
 * the tuple data plus the length word itself.  Allocates a single chunk
 * holding a rebuilt HeapTupleData followed by the tuple data, charges it
 * against state->availMem, and, if randomAccess, consumes the trailing
 * length word.
 */
static void *
readtup_heap(Tuplestorestate *state, unsigned int len)
{
	unsigned int tuplen = len - sizeof(unsigned int) + HEAPTUPLESIZE;
	HeapTuple	tuple = (HeapTuple) palloc(tuplen);

	USEMEM(state, GetMemoryChunkSpace(tuple));
	/* reconstruct the HeapTupleData portion */
	tuple->t_len = len - sizeof(unsigned int);
	ItemPointerSetInvalid(&(tuple->t_self));
	tuple->t_datamcxt = CurrentMemoryContext;
	tuple->t_data = (HeapTupleHeader) (((char *) tuple) + HEAPTUPLESIZE);
	/* read in the tuple proper */
	if (BufFileRead(state->myfile, (void *) tuple->t_data,
					tuple->t_len) != (size_t) tuple->t_len)
		elog(ERROR, "unexpected end of data");
	if (state->randomAccess)	/* need trailing length word? */
		if (BufFileRead(state->myfile, (void *) &tuplen,
						sizeof(tuplen)) != sizeof(tuplen))
			elog(ERROR, "unexpected end of data");
	return (void *) tuple;
}