nbtsort.c 14.2 KB
Newer Older
1
/*-------------------------------------------------------------------------
2 3
 * nbtsort.c
 *		Build a btree from sorted input by loading leaf pages sequentially.
4 5 6
 *
 * NOTES
 *
7 8
 * We use tuplesort.c to sort the given index tuples into order.
 * Then we scan the index tuples in order and build the btree pages
9 10 11 12 13 14
 * for each level.  We load source tuples into leaf-level pages.
 * Whenever we fill a page at one level, we add a link to it to its
 * parent level (starting a new parent level if necessary).  When
 * done, we write out each final page on each level, adding it to
 * its parent level.  When we have only one page on a level, it must be
 * the root -- it can be attached to the btree metapage and we are done.
15 16 17 18 19
 *
 * this code is moderately slow (~10% slower) compared to the regular
 * btree (insertion) build code on sorted or well-clustered data.  on
 * random data, however, the insertion build code is unusable -- the
 * difference on a 60MB heap is a factor of 15 because the random
20 21 22 23
 * probes into the btree thrash the buffer pool.  (NOTE: the above
 * "10%" estimate is probably obsolete, since it refers to an old and
 * not very good external sort implementation that used to exist in
 * this module.  tuplesort.c is almost certainly faster.)
24 25 26 27 28 29
 *
 * this code currently packs the pages to 100% of capacity.  this is
 * not wise, since *any* insertion will cause splitting.  filling to
 * something like the standard 70% steady-state load factor for btrees
 * would probably be better.
 *
30 31 32 33 34 35 36 37
 * Another limitation is that we currently load full copies of all keys
 * into upper tree levels.  The leftmost data key in each non-leaf node
 * could be omitted as far as normal btree operations are concerned
 * (see README for more info).  However, because we build the tree from
 * the bottom up, we need that data key to insert into the node's parent.
 * This could be fixed by keeping a spare copy of the minimum key in the
 * state stack, but I haven't time for that right now.
 *
38
 *
B
Add:  
Bruce Momjian 已提交
39 40
 * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
 * Portions Copyright (c) 1994, Regents of the University of California
41 42
 *
 * IDENTIFICATION
43
 *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.55 2000/07/21 06:42:33 tgl Exp $
44
 *
45 46 47
 *-------------------------------------------------------------------------
 */

48
#include "postgres.h"
49

50
#include "access/nbtree.h"
51
#include "utils/tuplesort.h"
52

53

54 55 56 57 58 59 60
/*
 * turn on debugging output.
 *
 * XXX this code just does a numeric printf of the index key, so it's
 * only really useful for integer keys.
 */
/*#define FASTBUILD_DEBUG*/
61 62

/*
63
 * Status record for spooling.
64
 */
65
struct BTSpool
66
{
67 68
	Tuplesortstate *sortstate;	/* state data for tuplesort.c */
	Relation	index;
69
	bool		isunique;
70
};
71

72 73 74 75 76 77 78 79 80 81 82 83 84 85
/*
 * Status record for a btree page being built.  We have one of these
 * for each active tree level.
 */
typedef struct BTPageState
{
	Buffer		btps_buf;		/* current buffer & page */
	Page		btps_page;
	OffsetNumber btps_lastoff;	/* last item offset loaded */
	int			btps_level;
	struct BTPageState *btps_next; /* link to parent level, if any */
} BTPageState;


86 87 88 89 90 91
#define BTITEMSZ(btitem) \
	((btitem) ? \
	 (IndexTupleDSize((btitem)->bti_itup) + \
	  (sizeof(BTItemData) - sizeof(IndexTupleData))) : \
	 0)

92

93
static void _bt_load(Relation index, BTSpool *btspool);
94 95
static void _bt_buildadd(Relation index, BTPageState *state,
						 BTItem bti, int flags);
96
static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend);
97 98
static BTPageState *_bt_pagestate(Relation index, int flags, int level);
static void _bt_uppershutdown(Relation index, BTPageState *state);
99

100 101

/*
102
 * Interface routines
103 104 105 106
 */


/*
107
 * create and initialize a spool structure
108
 */
109
BTSpool    *
110
_bt_spoolinit(Relation index, bool isunique)
111
{
112
	BTSpool    *btspool = (BTSpool *) palloc(sizeof(BTSpool));
113

B
Bruce Momjian 已提交
114
	MemSet((char *) btspool, 0, sizeof(BTSpool));
115

116 117
	btspool->index = index;
	btspool->isunique = isunique;
118

119
	btspool->sortstate = tuplesort_begin_index(index, isunique, false);
120

121
	/*
122 123 124
	 * Currently, tuplesort provides sort functions on IndexTuples. If we
	 * kept anything in a BTItem other than a regular IndexTuple, we'd
	 * need to modify tuplesort to understand BTItems as such.
125 126
	 */
	Assert(sizeof(BTItemData) == sizeof(IndexTupleData));
127

128
	return btspool;
129 130 131 132 133 134
}

/*
 * clean up a spool structure and its substructures.
 */
void
135
_bt_spooldestroy(BTSpool *btspool)
136
{
137
	tuplesort_end(btspool->sortstate);
138
	pfree((void *) btspool);
139 140 141
}

/*
142
 * spool a btitem into the sort file.
143
 */
144 145
void
_bt_spool(BTItem btitem, BTSpool *btspool)
146
{
147 148
	/* A BTItem is really just an IndexTuple */
	tuplesort_puttuple(btspool->sortstate, (void *) btitem);
149 150 151
}

/*
152 153
 * given a spool loaded by successive calls to _bt_spool,
 * create an entire btree.
154
 */
155 156
void
_bt_leafbuild(BTSpool *btspool)
157
{
158
#ifdef BTREE_BUILD_STATS
159
	if (Show_btree_build_stats)
160
	{
161
		fprintf(StatFp, "BTREE BUILD (Spool) STATISTICS\n");
162 163
		ShowUsage();
		ResetUsage();
164
	}
165
#endif /* BTREE_BUILD_STATS */
166 167 168
	tuplesort_performsort(btspool->sortstate);

	_bt_load(btspool->index, btspool);
169 170 171 172
}


/*
173
 * Internal routines.
174
 */
175

176 177 178 179 180

/*
 * allocate a new, clean btree page, not linked to any siblings.
 */
static void
181
_bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags)
182
{
183
	BTPageOpaque opaque;
184

185 186 187 188 189 190
	*buf = _bt_getbuf(index, P_NEW, BT_WRITE);
	*page = BufferGetPage(*buf);
	_bt_pageinit(*page, BufferGetPageSize(*buf));
	opaque = (BTPageOpaque) PageGetSpecialPointer(*page);
	opaque->btpo_prev = opaque->btpo_next = P_NONE;
	opaque->btpo_flags = flags;
191 192 193 194
}

/*
 * slide an array of ItemIds back one slot (from P_FIRSTKEY to
195 196 197
 * P_HIKEY, overwriting P_HIKEY).  we need to do this when we discover
 * that we have built an ItemId array in what has turned out to be a
 * P_RIGHTMOST page.
198 199 200 201
 */
static void
_bt_slideleft(Relation index, Buffer buf, Page page)
{
202 203 204 205
	OffsetNumber off;
	OffsetNumber maxoff;
	ItemId		previi;
	ItemId		thisii;
206 207 208 209 210 211 212 213 214 215 216 217

	if (!PageIsEmpty(page))
	{
		maxoff = PageGetMaxOffsetNumber(page);
		previi = PageGetItemId(page, P_HIKEY);
		for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
		{
			thisii = PageGetItemId(page, off);
			*previi = *thisii;
			previi = thisii;
		}
		((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
218
	}
219 220
}

221 222 223 224
/*
 * allocate and initialize a new BTPageState.  the returned structure
 * is suitable for immediate use by _bt_buildadd.
 */
225
static BTPageState *
226
_bt_pagestate(Relation index, int flags, int level)
227
{
228
	BTPageState *state = (BTPageState *) palloc(sizeof(BTPageState));
229

B
Bruce Momjian 已提交
230
	MemSet((char *) state, 0, sizeof(BTPageState));
231 232 233 234 235
	_bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), flags);
	state->btps_lastoff = P_HIKEY;
	state->btps_next = (BTPageState *) NULL;
	state->btps_level = level;

236
	return state;
237 238 239 240 241 242 243 244
}

/*
 * return a copy of the minimum (P_HIKEY or P_FIRSTKEY) item on
 * 'opage'.  the copy is modified to point to 'opage' (as opposed to
 * the page to which the item used to point, e.g., a heap page if
 * 'opage' is a leaf page).
 */
245
static BTItem
246 247
_bt_minitem(Page opage, BlockNumber oblkno, int atend)
{
248 249 250
	OffsetNumber off;
	BTItem		obti;
	BTItem		nbti;
251

252 253 254 255
	off = atend ? P_HIKEY : P_FIRSTKEY;
	obti = (BTItem) PageGetItem(opage, PageGetItemId(opage, off));
	nbti = _bt_formitem(&(obti->bti_itup));
	ItemPointerSet(&(nbti->bti_itup.t_tid), oblkno, P_HIKEY);
256

257
	return nbti;
258 259
}

260
/*
261
 * add an item to a disk page from the sort output.
262 263 264 265
 *
 * we must be careful to observe the following restrictions, placed
 * upon us by the conventions in nbtsearch.c:
 * - rightmost pages start data items at P_HIKEY instead of at
266
 *	 P_FIRSTKEY.
267 268 269 270
 *
 * a leaf page being built looks like:
 *
 * +----------------+---------------------------------+
271
 * | PageHeaderData | linp0 linp1 linp2 ...			  |
272
 * +-----------+----+---------------------------------+
273
 * | ... linpN |									  |
274
 * +-----------+--------------------------------------+
275 276
 * |	 ^ last										  |
 * |												  |
277
 * +-------------+------------------------------------+
278
 * |			 | itemN ...						  |
279
 * +-------------+------------------+-----------------+
280
 * |		  ... item3 item2 item1 | "special space" |
281 282 283 284 285 286 287 288
 * +--------------------------------+-----------------+
 *
 * contrast this with the diagram in bufpage.h; note the mismatch
 * between linps and items.  this is because we reserve linp0 as a
 * placeholder for the pointer to the "high key" item; when we have
 * filled up the page, we will set linp0 to point to itemN and clear
 * linpN.
 *
289
 * 'last' pointer indicates the last offset added to the page.
290
 */
291 292
static void
_bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags)
293
{
294 295 296 297 298
	Buffer		nbuf;
	Page		npage;
	OffsetNumber last_off;
	Size		pgspc;
	Size		btisz;
299 300 301 302 303 304 305

	nbuf = state->btps_buf;
	npage = state->btps_page;
	last_off = state->btps_lastoff;

	pgspc = PageGetFreeSpace(npage);
	btisz = BTITEMSZ(bti);
306
	btisz = MAXALIGN(btisz);
307 308

	/*
309 310 311 312 313
	 * Check whether the item can fit on a btree page at all. (Eventually,
	 * we ought to try to apply TOAST methods if not.) We actually need to
	 * be able to fit three items on every page, so restrict any one item
	 * to 1/3 the per-page available space. Note that at this point, btisz
	 * doesn't include the ItemId.
314 315
	 *
	 * NOTE: similar code appears in _bt_insertonpg() to defend against
316 317
	 * oversize items being inserted into an already-existing index. But
	 * during creation of an index, we don't go through there.
318
	 */
319
	if (btisz > (PageGetPageSize(npage) - sizeof(PageHeaderData) - MAXALIGN(sizeof(BTPageOpaqueData))) / 3 - sizeof(ItemIdData))
320
		elog(ERROR, "btree: index item size %d exceeds maximum %ld",
321
			 btisz,
322
			 (PageGetPageSize(npage) - sizeof(PageHeaderData) - MAXALIGN(sizeof(BTPageOpaqueData))) /3 - sizeof(ItemIdData));
323

324 325
	if (pgspc < btisz)
	{
326 327 328 329
		/*
		 * Item won't fit on this page, so finish off the page and
		 * write it out.
		 */
330 331 332 333
		Buffer		obuf = nbuf;
		Page		opage = npage;
		ItemId		ii;
		ItemId		hii;
334
		BTItem		nbti;
335

336 337 338
		_bt_blnewpage(index, &nbuf, &npage, flags);

		/*
339 340 341
		 * We copy the last item on the page into the new page, and then
		 * rearrange the old page so that the 'last item' becomes its high
		 * key rather than a true data item.
342
		 *
343 344
		 * note that since we always copy an item to the new page,
		 * 'bti' will never be the first data item on the new page.
345
		 */
346 347 348 349 350
		ii = PageGetItemId(opage, last_off);
		if (PageAddItem(npage, PageGetItem(opage, ii), ii->lp_len,
						P_FIRSTKEY, LP_USED) == InvalidOffsetNumber)
			elog(FATAL, "btree: failed to add item to the page in _bt_sort (1)");
#ifdef FASTBUILD_DEBUG
351
		{
352 353 354 355 356 357 358 359
			bool		isnull;
			BTItem		tmpbti =
				(BTItem) PageGetItem(npage, PageGetItemId(npage, P_FIRSTKEY));
			Datum		d = index_getattr(&(tmpbti->bti_itup), 1,
										  index->rd_att, &isnull);

			printf("_bt_buildadd: moved <%x> to offset %d at level %d\n",
				   d, P_FIRSTKEY, state->btps_level);
360
		}
361
#endif
362

363
		/*
364
		 * Move 'last' into the high key position on opage
365 366 367 368 369
		 */
		hii = PageGetItemId(opage, P_HIKEY);
		*hii = *ii;
		ii->lp_flags &= ~LP_USED;
		((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
370

371 372 373
		/*
		 * Reset last_off to point to new page
		 */
374
		last_off = PageGetMaxOffsetNumber(npage);
375

376 377 378 379
		/*
		 * set the page (side link) pointers.
		 */
		{
380 381
			BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
			BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);
382 383 384 385 386 387 388

			oopaque->btpo_next = BufferGetBlockNumber(nbuf);
			nopaque->btpo_prev = BufferGetBlockNumber(obuf);
			nopaque->btpo_next = P_NONE;
		}

		/*
389 390 391
		 * Link the old buffer into its parent, using its minimum key.
		 * If we don't have a parent, we have to create one;
		 * this adds a new btree level.
392
		 */
393
		if (state->btps_next == (BTPageState *) NULL)
394
		{
395 396
			state->btps_next =
				_bt_pagestate(index, 0, state->btps_level + 1);
397
		}
398 399 400
		nbti = _bt_minitem(opage, BufferGetBlockNumber(obuf), 0);
		_bt_buildadd(index, state->btps_next, nbti, 0);
		pfree((void *) nbti);
401 402 403 404 405 406 407

		/*
		 * write out the old stuff.  we never want to see it again, so we
		 * can give up our lock (if we had one; BuildingBtree is set, so
		 * we aren't locking).
		 */
		_bt_wrtbuf(index, obuf);
408 409
	}

410
	/*
411
	 * Add the new item into the current page.
412
	 */
413 414 415
	last_off = OffsetNumberNext(last_off);
	if (PageAddItem(npage, (Item) bti, btisz,
					last_off, LP_USED) == InvalidOffsetNumber)
416
		elog(FATAL, "btree: failed to add item to the page in _bt_sort (2)");
417
#ifdef FASTBUILD_DEBUG
418
	{
419 420
		bool		isnull;
		Datum		d = index_getattr(&(bti->bti_itup), 1, index->rd_att, &isnull);
421 422

		printf("_bt_buildadd: inserted <%x> at offset %d at level %d\n",
423
			   d, last_off, state->btps_level);
424
	}
425
#endif
426 427 428 429

	state->btps_buf = nbuf;
	state->btps_page = npage;
	state->btps_lastoff = last_off;
430 431
}

432 433 434
/*
 * Finish writing out the completed btree.
 */
435
static void
436
_bt_uppershutdown(Relation index, BTPageState *state)
437
{
438 439 440 441
	BTPageState *s;
	BlockNumber blkno;
	BTPageOpaque opaque;
	BTItem		bti;
442

443 444 445
	/*
	 * Each iteration of this loop completes one more level of the tree.
	 */
446 447 448 449
	for (s = state; s != (BTPageState *) NULL; s = s->btps_next)
	{
		blkno = BufferGetBlockNumber(s->btps_buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);
450

451
		/*
452 453 454 455 456 457
		 * We have to link the last page on this level to somewhere.
		 *
		 * If we're at the top, it's the root, so attach it to the metapage.
		 * Otherwise, add an entry for it to its parent using its minimum
		 * key.  This may cause the last page of the parent level to split,
		 * but that's not a problem -- we haven't gotten to it yet.
458
		 */
459
		if (s->btps_next == (BTPageState *) NULL)
460
		{
461 462 463 464 465 466 467 468
			opaque->btpo_flags |= BTP_ROOT;
			_bt_metaproot(index, blkno, s->btps_level + 1);
		}
		else
		{
			bti = _bt_minitem(s->btps_page, blkno, 0);
			_bt_buildadd(index, s->btps_next, bti, 0);
			pfree((void *) bti);
469
		}
470

471
		/*
472 473
		 * This is the rightmost page, so the ItemId array needs to be
		 * slid back one slot.  Then we can dump out the page.
474 475 476 477
		 */
		_bt_slideleft(index, s->btps_buf, s->btps_page);
		_bt_wrtbuf(index, s->btps_buf);
	}
478 479 480
}

/*
481 482
 * Read tuples in correct sort order from tuplesort, and load them into
 * btree leaves.
483
 */
484
static void
485
_bt_load(Relation index, BTSpool *btspool)
486
{
487
	BTPageState *state = NULL;
488

489 490
	for (;;)
	{
491 492 493
		BTItem		bti;
		bool		should_free;

494 495 496 497
		bti = (BTItem) tuplesort_getindextuple(btspool->sortstate, true,
											   &should_free);
		if (bti == (BTItem) NULL)
			break;
498 499 500 501 502 503

		/* When we see first tuple, create first index page */
		if (state == NULL)
			state = _bt_pagestate(index, BTP_LEAF, 0);

		_bt_buildadd(index, state, bti, BTP_LEAF);
504 505 506
		if (should_free)
			pfree((void *) bti);
	}
507

508 509
	if (state != NULL)
		_bt_uppershutdown(index, state);
510
}