/*-------------------------------------------------------------------------
 *
 * nbtree.h
 *	  header file for postgres btree access method implementation.
 *
 *
 * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/access/nbtree.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef NBTREE_H
#define NBTREE_H

#include "access/genam.h"
#include "access/itup.h"
#include "access/sdir.h"
#include "access/xlog.h"
#include "access/xlogutils.h"


/* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */
typedef uint16 BTCycleId;

/*
 *	BTPageOpaqueData -- At the end of every page, we store a pointer
 *	to both siblings in the tree.  This is used to do forward/backward
 *	index scans.  The next-page link is also critical for recovery when
 *	a search has navigated to the wrong page due to concurrent page splits
 *	or deletions; see src/backend/access/nbtree/README for more info.
 *
 *	In addition, we store the page's btree level (counting upwards from
 *	zero at a leaf page) as well as some flag bits indicating the page type
 *	and status.  If the page is deleted, we replace the level with the
 *	next-transaction-ID value indicating when it is safe to reclaim the page.
 *
 *	We also store a "vacuum cycle ID".	When a page is split while VACUUM is
 *	processing the index, a nonzero value associated with the VACUUM run is
 *	stored into both halves of the split page.	(If VACUUM is not running,
 *	both pages receive zero cycleids.)	This allows VACUUM to detect whether
 *	a page was split since it started, with a small probability of false match
 *	if the page was last split some exact multiple of MAX_BT_CYCLE_ID VACUUMs
 *	ago.  Also, during a split, the BTP_SPLIT_END flag is cleared in the left
 *	(original) page, and set in the right page, but only if the next page
 *	to its right has a different cycleid.
 *
 *	NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
 *	instead.
 */

typedef struct BTPageOpaqueData
{
	BlockNumber btpo_prev;		/* left sibling, or P_NONE if leftmost */
	BlockNumber btpo_next;		/* right sibling, or P_NONE if rightmost */
	union
	{
		uint32		level;		/* tree level --- zero for leaf pages */
		TransactionId xact;		/* next transaction ID, if deleted */
	}			btpo;
	uint16		btpo_flags;		/* flag bits, see below */
	BTCycleId	btpo_cycleid;	/* vacuum cycle ID of latest split */
} BTPageOpaqueData;

typedef BTPageOpaqueData *BTPageOpaque;
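
/*
 * Illustrative usage sketch (not part of the original header): the opaque
 * data lives in the page's special space, so callers typically fetch it as
 *
 *		BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 *
 * and then test it with the P_ISLEAF()/P_RIGHTMOST()-style macros defined
 * further down in this file.
 */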

/* Bits defined in btpo_flags */
#define BTP_LEAF		(1 << 0)	/* leaf page, i.e. not internal page */
#define BTP_ROOT		(1 << 1)	/* root page (has no parent) */
#define BTP_DELETED		(1 << 2)	/* page has been deleted from tree */
#define BTP_META		(1 << 3)	/* meta-page */
#define BTP_HALF_DEAD	(1 << 4)	/* empty, but still in tree */
#define BTP_SPLIT_END	(1 << 5)	/* rightmost page of split group */
#define BTP_HAS_GARBAGE (1 << 6)	/* page has LP_DEAD tuples */

/*
 * The max allowed value of a cycle ID is a bit less than 64K.	This is
 * for convenience of pg_filedump and similar utilities: we want to use
 * the last 2 bytes of special space as an index type indicator, and
 * restricting cycle ID lets btree use that space for vacuum cycle IDs
 * while still allowing index type to be identified.
 */
#define MAX_BT_CYCLE_ID		0xFF7F


/*
 * The Meta page is always the first page in the btree index.
 * Its primary purpose is to point to the location of the btree root page.
 * We also point to the "fast" root, which is the current effective root;
 * see README for discussion.
 */

typedef struct BTMetaPageData
{
	uint32		btm_magic;		/* should contain BTREE_MAGIC */
	uint32		btm_version;	/* should contain BTREE_VERSION */
	BlockNumber btm_root;		/* current root location */
	uint32		btm_level;		/* tree level of the root page */
	BlockNumber btm_fastroot;	/* current "fast" root location */
	uint32		btm_fastlevel;	/* tree level of the "fast" root page */
} BTMetaPageData;

#define BTPageGetMeta(p) \
	((BTMetaPageData *) PageGetContents(p))

#define BTREE_METAPAGE	0		/* first page is meta */
#define BTREE_MAGIC		0x053162	/* magic number of btree pages */
#define BTREE_VERSION	2		/* current version number */

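/*
 * Illustrative usage sketch (not part of the original header): reading the
 * metapage, in the style of _bt_getroot (_bt_getbuf, BT_READ and _bt_relbuf
 * are declared later in this file):
 *
 *		Buffer		metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
 *		BTMetaPageData *metad = BTPageGetMeta(BufferGetPage(metabuf));
 *
 *		if (metad->btm_magic != BTREE_MAGIC)
 *			elog(ERROR, "index is not a btree");
 *		... consult metad->btm_root / metad->btm_fastroot ...
 *		_bt_relbuf(rel, metabuf);
 */
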
/*
 * Maximum size of a btree index entry, including its tuple header.
 *
 * We actually need to be able to fit three items on every page,
 * so restrict any one item to 1/3 the per-page available space.
 */
#define BTMaxItemSize(page) \
	MAXALIGN_DOWN((PageGetPageSize(page) - \
				   MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
				   MAXALIGN(sizeof(BTPageOpaqueData))) / 3)

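/*
 * Worked example (illustrative, assuming the default 8192-byte block size
 * and 8-byte MAXALIGN): the page header plus three line pointers take
 * MAXALIGN(24 + 3*4) = 40 bytes, and MAXALIGN(sizeof(BTPageOpaqueData)) is
 * 16 bytes, so BTMaxItemSize comes out to MAXALIGN_DOWN((8192 - 40 - 16) / 3)
 * = 2712 bytes per index entry.
 */
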
/*
 * The leaf-page fillfactor defaults to 90% but is user-adjustable.
 * For pages above the leaf level, we use a fixed 70% fillfactor.
 * The fillfactor is applied during index build and when splitting
 * a rightmost page; when splitting non-rightmost pages we try to
 * divide the data equally.
 */
#define BTREE_MIN_FILLFACTOR		10
#define BTREE_DEFAULT_FILLFACTOR	90
#define BTREE_NONLEAF_FILLFACTOR	70

/*
 *	Test whether two btree entries are "the same".
 *
 *	Old comments:
 *	In addition, we must guarantee that all tuples in the index are unique,
 *	in order to satisfy some assumptions in Lehman and Yao.  The way that we
 *	do this is by generating a new OID for every insertion that we do in the
 *	tree.  This adds eight bytes to the size of btree index tuples.  Note
 *	that we do not use the OID as part of a composite key; the OID only
 *	serves as a unique identifier for a given index tuple (logical position
 *	within a page).
 *
 *	New comments:
 *	actually, we must guarantee that all tuples in A LEVEL
 *	are unique, not in ALL INDEX. So, we can use the t_tid
 *	as unique identifier for a given index tuple (logical position
 *	within a level). - vadim 04/09/97
 */
#define BTTidSame(i1, i2)	\
	( (i1).ip_blkid.bi_hi == (i2).ip_blkid.bi_hi && \
	  (i1).ip_blkid.bi_lo == (i2).ip_blkid.bi_lo && \
	  (i1).ip_posid == (i2).ip_posid )
#define BTEntrySame(i1, i2) \
	BTTidSame((i1)->t_tid, (i2)->t_tid)


/*
 *	In general, the btree code tries to localize its knowledge about
 *	page layout to a couple of routines.  However, we need a special
 *	value to indicate "no page number" in those places where we expect
 *	page numbers.  We can use zero for this because we never need to
 *	make a pointer to the metadata page.
 */

#define P_NONE			0

/*
 * Macros to test whether a page is leftmost or rightmost on its tree level,
 * as well as other state info kept in the opaque data.
 */
#define P_LEFTMOST(opaque)		((opaque)->btpo_prev == P_NONE)
#define P_RIGHTMOST(opaque)		((opaque)->btpo_next == P_NONE)
#define P_ISLEAF(opaque)		((opaque)->btpo_flags & BTP_LEAF)
#define P_ISROOT(opaque)		((opaque)->btpo_flags & BTP_ROOT)
#define P_ISDELETED(opaque)		((opaque)->btpo_flags & BTP_DELETED)
#define P_ISHALFDEAD(opaque)	((opaque)->btpo_flags & BTP_HALF_DEAD)
#define P_IGNORE(opaque)		((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD))
#define P_HAS_GARBAGE(opaque)	((opaque)->btpo_flags & BTP_HAS_GARBAGE)

/*
 *	Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
 *	page.  The high key is not a data key, but gives info about what range of
 *	keys is supposed to be on this page.  The high key on a page is required
 *	to be greater than or equal to any data key that appears on the page.
 *	If we find ourselves trying to insert a key > high key, we know we need
 *	to move right (this should only happen if the page was split since we
 *	examined the parent page).
 *
 *	Our insertion algorithm guarantees that we can use the initial least key
 *	on our right sibling as the high key.  Once a page is created, its high
 *	key changes only if the page is split.
 *
 *	On a non-rightmost page, the high key lives in item 1 and data items
 *	start in item 2.  Rightmost pages have no high key, so we store data
 *	items beginning in item 1.
 */
#define P_HIKEY				((OffsetNumber) 1)
#define P_FIRSTKEY			((OffsetNumber) 2)
#define P_FIRSTDATAKEY(opaque)	(P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
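
/*
 * Illustrative usage sketch (not part of the original header): a typical
 * loop over the data items of a page, skipping the high key when present:
 *
 *		OffsetNumber offnum,
 *					maxoff = PageGetMaxOffsetNumber(page);
 *
 *		for (offnum = P_FIRSTDATAKEY(opaque); offnum <= maxoff; offnum++)
 *		{
 *			IndexTuple	itup = (IndexTuple)
 *				PageGetItem(page, PageGetItemId(page, offnum));
 *
 *			...
 *		}
 */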
/*
 * XLOG records for btree operations
 *
 * XLOG allows us to store some information in the high 4 bits of the log
 * record's xl_info field
 */
#define XLOG_BTREE_INSERT_LEAF	0x00	/* add index tuple without split */
#define XLOG_BTREE_INSERT_UPPER 0x10	/* same, on a non-leaf page */
#define XLOG_BTREE_INSERT_META	0x20	/* same, plus update metapage */
#define XLOG_BTREE_SPLIT_L		0x30	/* add index tuple with split */
#define XLOG_BTREE_SPLIT_R		0x40	/* as above, new item on right */
#define XLOG_BTREE_SPLIT_L_ROOT 0x50	/* add tuple with split of root */
#define XLOG_BTREE_SPLIT_R_ROOT 0x60	/* as above, new item on right */
#define XLOG_BTREE_DELETE		0x70	/* delete leaf index tuples for a page */
#define XLOG_BTREE_DELETE_PAGE	0x80	/* delete an entire page */
#define XLOG_BTREE_DELETE_PAGE_META 0x90		/* same, and update metapage */
#define XLOG_BTREE_NEWROOT		0xA0	/* new root page */
#define XLOG_BTREE_DELETE_PAGE_HALF 0xB0		/* page deletion that makes
												 * parent half-dead */
#define XLOG_BTREE_VACUUM		0xC0	/* delete entries on a page during
										 * vacuum */
#define XLOG_BTREE_REUSE_PAGE	0xD0	/* old page is about to be reused from
										 * FSM */

/*
 * All that we need to find the changed index tuple
 */
typedef struct xl_btreetid
{
	RelFileNode node;
	ItemPointerData tid;		/* changed tuple id */
} xl_btreetid;

/*
 * All that we need to regenerate the meta-data page
 */
typedef struct xl_btree_metadata
{
	BlockNumber root;
	uint32		level;
	BlockNumber fastroot;
	uint32		fastlevel;
} xl_btree_metadata;

/*
 * This is what we need to know about simple (without split) insert.
 *
 * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
 * Note that INSERT_META implies it's not a leaf page.
 */
typedef struct xl_btree_insert
{
	xl_btreetid target;			/* inserted tuple id */
	/* BlockNumber downlink field FOLLOWS IF NOT XLOG_BTREE_INSERT_LEAF */
	/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_INSERT_META */
	/* INDEX TUPLE FOLLOWS AT END OF STRUCT */
} xl_btree_insert;

#define SizeOfBtreeInsert	(offsetof(xl_btreetid, tid) + SizeOfIptrData)

/*
 * On insert with split, we save all the items going into the right sibling
 * so that we can restore it completely from the log record.  This way takes
 * less xlog space than the normal approach, because if we did it standardly,
 * XLogInsert would almost always think the right page is new and store its
 * whole page image.  The left page, however, is handled in the normal
 * incremental-update fashion.
 *
 * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
 * The _L and _R variants indicate whether the inserted tuple went into the
 * left or right split page (and thus, whether newitemoff and the new item
 * are stored or not).	The _ROOT variants indicate that we are splitting
 * the root page, and thus that a newroot record rather than an insert or
 * split record should follow.	Note that a split record never carries a
 * metapage update --- we'll do that in the parent-level update.
 */
typedef struct xl_btree_split
{
	RelFileNode node;
	BlockNumber leftsib;		/* orig page / new left page */
	BlockNumber rightsib;		/* new right page */
	BlockNumber rnext;			/* next block (orig page's rightlink) */
	uint32		level;			/* tree level of page being split */
	OffsetNumber firstright;	/* first item moved to right page */

	/*
	 * If level > 0, BlockIdData downlink follows.	(We use BlockIdData rather
	 * than BlockNumber for alignment reasons: SizeOfBtreeSplit is only 16-bit
	 * aligned.)
	 *
	 * If level > 0, an IndexTuple representing the HIKEY of the left page
	 * follows.  We don't need this on leaf pages, because it's the same as
	 * the leftmost key in the new right page.	Also, it's suppressed if
	 * XLogInsert chooses to store the left page's whole page image.
	 *
	 * In the _L variants, next are OffsetNumber newitemoff and the new item.
	 * (In the _R variants, the new item is one of the right page's tuples.)
	 * The new item, but not newitemoff, is suppressed if XLogInsert chooses
	 * to store the left page's whole page image.
	 *
	 * Last are the right page's tuples in the form used by _bt_restore_page.
	 */
} xl_btree_split;

#define SizeOfBtreeSplit	(offsetof(xl_btree_split, firstright) + sizeof(OffsetNumber))

/*
 * This is what we need to know about delete of individual leaf index tuples.
 * The WAL record can represent deletion of any number of index tuples on a
 * single index page when *not* executed by VACUUM.
 */
typedef struct xl_btree_delete
{
	RelFileNode node;			/* RelFileNode of the index */
	BlockNumber block;
	RelFileNode hnode;			/* RelFileNode of the heap the index currently
								 * points at */
	int			nitems;

	/* TARGET OFFSET NUMBERS FOLLOW AT THE END */
} xl_btree_delete;

#define SizeOfBtreeDelete	(offsetof(xl_btree_delete, nitems) + sizeof(int))

/*
 * This is what we need to know about page reuse within btree.
 */
typedef struct xl_btree_reuse_page
{
	RelFileNode node;
	BlockNumber block;
	TransactionId latestRemovedXid;
} xl_btree_reuse_page;

#define SizeOfBtreeReusePage	(sizeof(xl_btree_reuse_page))

/*
 * This is what we need to know about vacuum of individual leaf index tuples.
 * The WAL record can represent deletion of any number of index tuples on a
 * single index page when executed by VACUUM.
 *
 * The correctness requirement for applying these changes during recovery is
 * that we must do one of these two things for every block in the index:
 *		* lock the block for cleanup and apply any required changes
 *		* EnsureBlockUnpinned()
 * The purpose of this is to ensure that no index scans started before we
 * finish scanning the index are still running by the time we begin to remove
 * heap tuples.
 *
 * Any changes to any one block are registered on just one WAL record. All
 * blocks that we need to run EnsureBlockUnpinned() are listed as a block range
 * starting from the last block vacuumed through until this one. Individual
 * block numbers aren't given.
 *
 * Note that the *last* WAL record in any vacuum of an index is allowed to
 * have a zero length array of offsets. Earlier records must have at least one.
 */
typedef struct xl_btree_vacuum
{
	RelFileNode node;
	BlockNumber block;
	BlockNumber lastBlockVacuumed;

	/* TARGET OFFSET NUMBERS FOLLOW */
} xl_btree_vacuum;

#define SizeOfBtreeVacuum	(offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber))

/*
 * This is what we need to know about deletion of a btree page.  The target
 * identifies the tuple removed from the parent page (note that we remove
 * this tuple's downlink and the *following* tuple's key).	Note we do not
 * store any content for the deleted page --- it is just rewritten as empty
 * during recovery, apart from resetting the btpo.xact.
 */
typedef struct xl_btree_delete_page
{
	xl_btreetid target;			/* deleted tuple id in parent page */
	BlockNumber deadblk;		/* child block being deleted */
	BlockNumber leftblk;		/* child block's left sibling, if any */
	BlockNumber rightblk;		/* child block's right sibling */
	TransactionId btpo_xact;	/* value of btpo.xact for use in recovery */
	/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_DELETE_PAGE_META */
} xl_btree_delete_page;

#define SizeOfBtreeDeletePage	(offsetof(xl_btree_delete_page, btpo_xact) + sizeof(TransactionId))

/*
 * New root log record.  There are zero tuples if this is to establish an
 * empty root, or two if it is the result of splitting an old root.
 *
 * Note that although this implies rewriting the metadata page, we don't need
 * an xl_btree_metadata record --- the rootblk and level are sufficient.
 */
typedef struct xl_btree_newroot
{
	RelFileNode node;
	BlockNumber rootblk;		/* location of new root */
	uint32		level;			/* its tree level */
	/* 0 or 2 INDEX TUPLES FOLLOW AT END OF STRUCT */
} xl_btree_newroot;

#define SizeOfBtreeNewroot	(offsetof(xl_btree_newroot, level) + sizeof(uint32))


/*
 *	Operator strategy numbers for B-tree have been moved to access/skey.h,
 *	because many places need to use them in ScanKeyInit() calls.
 *
 *	The strategy numbers are chosen so that we can commute them by
 *	subtraction, thus:
 */
#define BTCommuteStrategyNumber(strat)	(BTMaxStrategyNumber + 1 - (strat))

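/*
 * Worked example (illustrative): with the strategy numbers from skey.h
 * (1 = less, 2 = less-or-equal, 3 = equal, 4 = greater-or-equal,
 * 5 = greater, and BTMaxStrategyNumber = 5), the macro swaps 1 with 5 and
 * 2 with 4, while BTCommuteStrategyNumber(BTEqualStrategyNumber) yields 3:
 * equality is its own commutation.
 */
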
/*
 *	When a new operator class is declared, we require that the user
 *	supply us with an amproc procedure for determining whether, for
 *	two keys a and b, a < b, a = b, or a > b.  This routine must
 *	return < 0, 0, > 0, respectively, in these three cases.  Since we
 *	only have one such proc in amproc, it's number 1.
 */

#define BTORDER_PROC	1

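/*
 * Illustrative sketch (not part of the original header) of what such an
 * amproc might look like for a hypothetical int4-like type; the function
 * name is an assumption, not PostgreSQL code:
 *
 *		Datum
 *		mytype_cmp(PG_FUNCTION_ARGS)
 *		{
 *			int32		a = PG_GETARG_INT32(0);
 *			int32		b = PG_GETARG_INT32(1);
 *
 *			PG_RETURN_INT32((a < b) ? -1 : ((a > b) ? 1 : 0));
 *		}
 */
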
/*
 *	We need to be able to tell the difference between read and write
 *	requests for pages, in order to do locking correctly.
 */

#define BT_READ			BUFFER_LOCK_SHARE
#define BT_WRITE		BUFFER_LOCK_EXCLUSIVE

/*
 *	BTStackData -- As we descend a tree, we push the (location, downlink)
 *	pairs from internal pages onto a private stack.  If we split a
 *	leaf, we use this stack to walk back up the tree and insert data
 *	into parent pages (and possibly to split them, too).  Lehman and
 *	Yao's update algorithm guarantees that under no circumstances can
 *	our private stack give us an irredeemably bad picture up the tree.
 *	Again, see the paper for details.
 */

typedef struct BTStackData
{
	BlockNumber bts_blkno;
	OffsetNumber bts_offset;
	IndexTupleData bts_btentry;
	struct BTStackData *bts_parent;
} BTStackData;

typedef BTStackData *BTStack;

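/*
 * Illustrative sketch (not part of the original header): after a leaf page
 * split, the insertion code walks back up through the saved stack roughly as
 *
 *		BTStack		s;
 *
 *		for (s = stack; s != NULL; s = s->bts_parent)
 *			... re-find the downlink near (s->bts_blkno, s->bts_offset) ...
 */
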
/*
 * BTScanOpaqueData is the btree-private state needed for an indexscan.
 * This consists of preprocessed scan keys (see _bt_preprocess_keys() for
 * details of the preprocessing), information about the current location
 * of the scan, and information about the marked location, if any.	(We use
 * BTScanPosData to represent the data needed for each of current and marked
 * locations.)	In addition we can remember some known-killed index entries
 * that must be marked before we can move off the current page.
 *
 * Index scans work a page at a time: we pin and read-lock the page, identify
 * all the matching items on the page and save them in BTScanPosData, then
 * release the read-lock while returning the items to the caller for
 * processing.	This approach minimizes lock/unlock traffic.  Note that we
 * keep the pin on the index page until the caller is done with all the items
 * (this is needed for VACUUM synchronization, see nbtree/README).	When we
 * are ready to step to the next page, if the caller has told us any of the
 * items were killed, we re-lock the page to mark them killed, then unlock.
 * Finally we drop the pin and step to the next page in the appropriate
 * direction.
 */

typedef struct BTScanPosItem	/* what we remember about each match */
{
	ItemPointerData heapTid;	/* TID of referenced heap item */
	OffsetNumber indexOffset;	/* index item's location within page */
} BTScanPosItem;

typedef struct BTScanPosData
{
	Buffer		buf;			/* if valid, the buffer is pinned */

	BlockNumber nextPage;		/* page's right link when we scanned it */

	/*
	 * moreLeft and moreRight track whether we think there may be matching
	 * index entries to the left and right of the current page, respectively.
	 * We can clear the appropriate one of these flags when _bt_checkkeys()
	 * returns continuescan = false.
	 */
	bool		moreLeft;
	bool		moreRight;

	/*
	 * The items array is always ordered in index order (ie, increasing
	 * indexoffset).  When scanning backwards it is convenient to fill the
	 * array back-to-front, so we start at the last slot and fill downwards.
	 * Hence we need both a first-valid-entry and a last-valid-entry counter.
	 * itemIndex is a cursor showing which entry was last returned to caller.
	 */
	int			firstItem;		/* first valid index in items[] */
	int			lastItem;		/* last valid index in items[] */
	int			itemIndex;		/* current index in items[] */

	BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
} BTScanPosData;

typedef BTScanPosData *BTScanPos;

#define BTScanPosIsValid(scanpos) BufferIsValid((scanpos).buf)

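/*
 * Illustrative sketch (not part of the original header): given a valid
 * position "pos", a forward scan hands back the saved matches roughly as
 *
 *		while (pos->itemIndex <= pos->lastItem)
 *		{
 *			BTScanPosItem *currItem = &pos->items[pos->itemIndex];
 *
 *			... return currItem->heapTid to the caller ...
 *			pos->itemIndex++;
 *		}
 */
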
typedef struct BTScanOpaqueData
{
	/* these fields are set by _bt_preprocess_keys(): */
	bool		qual_ok;		/* false if qual can never be satisfied */
	int			numberOfKeys;	/* number of preprocessed scan keys */
	ScanKey		keyData;		/* array of preprocessed scan keys */

	/* info about killed items if any (killedItems is NULL if never used) */
	int		   *killedItems;	/* currPos.items indexes of killed items */
	int			numKilled;		/* number of currently stored items */

	/*
	 * If the marked position is on the same page as current position, we
	 * don't use markPos, but just keep the marked itemIndex in markItemIndex
	 * (all the rest of currPos is valid for the mark position). Hence, to
	 * determine if there is a mark, first look at markItemIndex, then at
	 * markPos.
	 */
	int			markItemIndex;	/* itemIndex, or -1 if not valid */

	/* keep these last in struct for efficiency */
	BTScanPosData currPos;		/* current position data */
	BTScanPosData markPos;		/* marked position, if any */
} BTScanOpaqueData;

typedef BTScanOpaqueData *BTScanOpaque;

/*
 * We use some private sk_flags bits in preprocessed scan keys.  We're allowed
 * to use bits 16-31 (see skey.h).	The uppermost bits are copied from the
 * index's indoption[] array entry for the index attribute.
 */
#define SK_BT_REQFWD	0x00010000		/* required to continue forward scan */
#define SK_BT_REQBKWD	0x00020000		/* required to continue backward scan */
#define SK_BT_INDOPTION_SHIFT  24		/* must clear the above bits */
#define SK_BT_DESC			(INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
#define SK_BT_NULLS_FIRST	(INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)

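/*
 * Illustrative sketch (not part of the original header): scan code tests
 * these bits along the lines of
 *
 *		if (skey->sk_flags & SK_BT_DESC)
 *			... the index column is stored in descending order ...
 *		if (skey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))
 *			... the key is required to continue the scan ...
 */
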
/*
 * prototypes for functions in nbtree.c (external entry points for btree)
 */
extern Datum btbuild(PG_FUNCTION_ARGS);
extern Datum btinsert(PG_FUNCTION_ARGS);
extern Datum btbeginscan(PG_FUNCTION_ARGS);
extern Datum btgettuple(PG_FUNCTION_ARGS);
extern Datum btgetbitmap(PG_FUNCTION_ARGS);
extern Datum btrescan(PG_FUNCTION_ARGS);
extern Datum btendscan(PG_FUNCTION_ARGS);
extern Datum btmarkpos(PG_FUNCTION_ARGS);
extern Datum btrestrpos(PG_FUNCTION_ARGS);
extern Datum btbulkdelete(PG_FUNCTION_ARGS);
extern Datum btvacuumcleanup(PG_FUNCTION_ARGS);
extern Datum btoptions(PG_FUNCTION_ARGS);

/*
 * prototypes for functions in nbtinsert.c
 */
extern bool _bt_doinsert(Relation rel, IndexTuple itup,
			 IndexUniqueCheck checkUnique, Relation heapRel);
extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, int access);
extern void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
				  BTStack stack, bool is_root, bool is_only);

/*
 * prototypes for functions in nbtpage.c
 */
extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level);
extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_gettrueroot(Relation rel);
extern void _bt_checkpage(Relation rel, Buffer buf);
extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
				 BlockNumber blkno, int access);
extern void _bt_relbuf(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
extern bool _bt_page_recyclable(Page page);
extern void _bt_delitems_delete(Relation rel, Buffer buf,
					OffsetNumber *itemnos, int nitems, Relation heapRel);
extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
		   OffsetNumber *itemnos, int nitems, BlockNumber lastBlockVacuumed);
extern int	_bt_pagedel(Relation rel, Buffer buf, BTStack stack);

/*
 * prototypes for functions in nbtsearch.c
 */
extern BTStack _bt_search(Relation rel,
		   int keysz, ScanKey scankey, bool nextkey,
		   Buffer *bufP, int access);
extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz,
			  ScanKey scankey, bool nextkey, int access);
extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz,
			ScanKey scankey, bool nextkey);
extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
			Page page, OffsetNumber offnum);
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);

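/*
 * Illustrative sketch (not part of the original header): a descent to a leaf
 * page in the style of the nbtinsert.c callers; "keysz" is the number of
 * index key attributes, and _bt_mkscankey/_bt_freestack/_bt_freeskey are
 * declared in the nbtutils.c section below:
 *
 *		Buffer		buf = InvalidBuffer;
 *		ScanKey		skey = _bt_mkscankey(rel, itup);
 *		BTStack		stack = _bt_search(rel, keysz, skey, false, &buf, BT_READ);
 *		OffsetNumber offnum = _bt_binsrch(rel, buf, keysz, skey, false);
 *
 *		... examine the leaf page ...
 *		_bt_relbuf(rel, buf);
 *		_bt_freestack(stack);
 *		_bt_freeskey(skey);
 */
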
/*
 * prototypes for functions in nbtutils.c
 */
extern ScanKey _bt_mkscankey(Relation rel, IndexTuple itup);
extern ScanKey _bt_mkscankey_nodata(Relation rel);
extern void _bt_freeskey(ScanKey skey);
extern void _bt_freestack(BTStack stack);
extern void _bt_preprocess_keys(IndexScanDesc scan);
extern bool _bt_checkkeys(IndexScanDesc scan,
			  Page page, OffsetNumber offnum,
			  ScanDirection dir, bool *continuescan);
extern void _bt_killitems(IndexScanDesc scan, bool haveLock);
extern BTCycleId _bt_vacuum_cycleid(Relation rel);
extern BTCycleId _bt_start_vacuum(Relation rel);
extern void _bt_end_vacuum(Relation rel);
extern void _bt_end_vacuum_callback(int code, Datum arg);
extern Size BTreeShmemSize(void);
extern void BTreeShmemInit(void);

/*
 * prototypes for functions in nbtsort.c
 */
typedef struct BTSpool BTSpool; /* opaque type known only within nbtsort.c */

extern BTSpool *_bt_spoolinit(Relation index, bool isunique, bool isdead);
extern void _bt_spooldestroy(BTSpool *btspool);
extern void _bt_spool(IndexTuple itup, BTSpool *btspool);
extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2);

/*
 * prototypes for functions in nbtxlog.c
 */
extern void btree_redo(XLogRecPtr lsn, XLogRecord *record);
extern void btree_desc(StringInfo buf, uint8 xl_info, char *rec);
extern void btree_xlog_startup(void);
extern void btree_xlog_cleanup(void);
extern bool btree_safe_restartpoint(void);

#endif   /* NBTREE_H */