/*-------------------------------------------------------------------------
 *
 * vacuum.c
 *	  The postgres vacuum cleaner.
 *
 * This file includes the "full" version of VACUUM, as well as control code
 * used by all three of full VACUUM, lazy VACUUM, and ANALYZE.	See
 * vacuumlazy.c and analyze.c for the rest of the code for the latter two.
 *
 *
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.349 2007/03/14 18:48:55 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <sys/time.h>
#include <unistd.h>

#include "access/clog.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/transam.h"
#include "access/xact.h"
#include "catalog/namespace.h"
#include "catalog/pg_database.h"
#include "commands/dbcommands.h"
#include "commands/vacuum.h"
#include "executor/executor.h"
#include "miscadmin.h"
#include "postmaster/autovacuum.h"
#include "storage/freespace.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/flatfiles.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/pg_rusage.h"
#include "utils/relcache.h"
#include "utils/syscache.h"
#include "pgstat.h"


/*
 * GUC parameters
 */
int			vacuum_freeze_min_age;

/*
 * VacPage structures keep track of each page on which we find useful
 * amounts of free space.
 */
typedef struct VacPageData
{
	BlockNumber blkno;			/* BlockNumber of this Page */
	Size		free;			/* FreeSpace on this Page */
	uint16		offsets_used;	/* Number of OffNums used by vacuum */
	uint16		offsets_free;	/* Number of OffNums free or to be free */
	OffsetNumber offsets[1];	/* Array of free OffNums */
} VacPageData;

typedef VacPageData *VacPage;

typedef struct VacPageListData
{
	BlockNumber empty_end_pages;	/* Number of "empty" end-pages */
	int			num_pages;		/* Number of pages in pagedesc */
	int			num_allocated_pages;	/* Number of allocated pages in
										 * pagedesc */
	VacPage    *pagedesc;		/* Descriptions of pages */
} VacPageListData;

typedef VacPageListData *VacPageList;

/*
 * The "vtlinks" array keeps information about each recently-updated tuple
 * ("recent" meaning its XMAX is too new to let us recycle the tuple).
 * We store the tuple's own TID as well as its t_ctid (its link to the next
 * newer tuple version).  Searching in this array allows us to follow update
 * chains backwards from newer to older tuples.  When we move a member of an
 * update chain, we must move *all* the live members of the chain, so that we
 * can maintain their t_ctid link relationships (we must not just overwrite
 * t_ctid in an existing tuple).
 *
 * Note: because t_ctid links can be stale (this would only occur if a prior
 * VACUUM crashed partway through), it is possible that new_tid points to an
 * empty slot or unrelated tuple.  We have to check the linkage as we follow
 * it, just as is done in EvalPlanQual.
 */
typedef struct VTupleLinkData
{
	ItemPointerData new_tid;	/* t_ctid of an updated tuple */
	ItemPointerData this_tid;	/* t_self of the tuple */
} VTupleLinkData;

typedef VTupleLinkData *VTupleLink;
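
/*
 * scan_heap() fills this array for RECENTLY_DEAD updated tuples and, when
 * shrinking will actually be attempted, sorts it with vac_cmp_vtlinks
 * (i.e. by new_tid) so that the chain-following code can look entries up
 * with vac_bsearch().
 */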

/*
 * We use an array of VTupleMoveData to plan a chain tuple move fully
 * before we do it.
 */
typedef struct VTupleMoveData
{
	ItemPointerData tid;		/* tuple ID */
	VacPage		vacpage;		/* where to move it to */
	bool		cleanVpd;		/* clean vacpage before using? */
} VTupleMoveData;

typedef VTupleMoveData *VTupleMove;

/*
 * VRelStats contains the data acquired by scan_heap for use later
 */
typedef struct VRelStats
{
	/* miscellaneous statistics */
	BlockNumber rel_pages;
	double		rel_tuples;
	Size		min_tlen;
	Size		max_tlen;
	bool		hasindex;
	/* vtlinks array for tuple chain following - sorted by new_tid */
	int			num_vtlinks;
	VTupleLink	vtlinks;
} VRelStats;

/*----------------------------------------------------------------------
 * ExecContext:
 *
 * As these variables always appear together, we put them into one struct
 * and pull initialization and cleanup into separate routines.
 * ExecContext is used by repair_frag() and move_xxx_tuple().  More
 * accurately:	It is *used* only in move_xxx_tuple(), but because this
 * routine is called many times, we initialize the struct just once in
 * repair_frag() and pass it on to move_xxx_tuple().
 */
typedef struct ExecContextData
{
	ResultRelInfo *resultRelInfo;
	EState	   *estate;
	TupleTableSlot *slot;
} ExecContextData;

typedef ExecContextData *ExecContext;

static void
ExecContext_Init(ExecContext ec, Relation rel)
{
	TupleDesc	tupdesc = RelationGetDescr(rel);

	/*
	 * We need a ResultRelInfo and an EState so we can use the regular
	 * executor's index-entry-making machinery.
	 */
	ec->estate = CreateExecutorState();

	ec->resultRelInfo = makeNode(ResultRelInfo);
	ec->resultRelInfo->ri_RangeTableIndex = 1;	/* dummy */
	ec->resultRelInfo->ri_RelationDesc = rel;
	ec->resultRelInfo->ri_TrigDesc = NULL;		/* we don't fire triggers */

	ExecOpenIndices(ec->resultRelInfo);

	ec->estate->es_result_relations = ec->resultRelInfo;
	ec->estate->es_num_result_relations = 1;
	ec->estate->es_result_relation_info = ec->resultRelInfo;

	/* Set up a tuple slot too */
	ec->slot = MakeSingleTupleTableSlot(tupdesc);
}

static void
ExecContext_Finish(ExecContext ec)
{
	ExecDropSingleTupleTableSlot(ec->slot);
	ExecCloseIndices(ec->resultRelInfo);
	FreeExecutorState(ec->estate);
}

/*
 * End of ExecContext Implementation
 *----------------------------------------------------------------------
 */
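
/*
 * Typical usage, per the comment above: repair_frag() sets up a single
 * ExecContext for the target relation and reuses it for every tuple move,
 * roughly:
 *
 *		ExecContextData ec;
 *
 *		ExecContext_Init(&ec, onerel);
 *		... repeated move_chain_tuple() / move_plain_tuple() calls ...
 *		ExecContext_Finish(&ec);
 */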

static MemoryContext vac_context = NULL;

static int	elevel = -1;

static TransactionId OldestXmin;
static TransactionId FreezeLimit;


/* non-export function prototypes */
static List *get_rel_oids(List *relids, const RangeVar *vacrel,
			 const char *stmttype);
static void vac_truncate_clog(TransactionId frozenXID);
static void vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind);
static void full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt);
static void scan_heap(VRelStats *vacrelstats, Relation onerel,
		  VacPageList vacuum_pages, VacPageList fraged_pages);
static void repair_frag(VRelStats *vacrelstats, Relation onerel,
			VacPageList vacuum_pages, VacPageList fraged_pages,
			int nindexes, Relation *Irel);
static void move_chain_tuple(Relation rel,
				 Buffer old_buf, Page old_page, HeapTuple old_tup,
				 Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
				 ExecContext ec, ItemPointer ctid, bool cleanVpd);
static void move_plain_tuple(Relation rel,
				 Buffer old_buf, Page old_page, HeapTuple old_tup,
				 Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
				 ExecContext ec);
static void update_hint_bits(Relation rel, VacPageList fraged_pages,
				 int num_fraged_pages, BlockNumber last_move_dest_block,
				 int num_moved);
static void vacuum_heap(VRelStats *vacrelstats, Relation onerel,
			VacPageList vacpagelist);
static void vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage);
static void vacuum_index(VacPageList vacpagelist, Relation indrel,
			 double num_tuples, int keep_tuples);
static void scan_index(Relation indrel, double num_tuples);
static bool tid_reaped(ItemPointer itemptr, void *state);
static void vac_update_fsm(Relation onerel, VacPageList fraged_pages,
			   BlockNumber rel_pages);
static VacPage copy_vac_page(VacPage vacpage);
static void vpage_insert(VacPageList vacpagelist, VacPage vpnew);
static void *vac_bsearch(const void *key, const void *base,
			size_t nelem, size_t size,
			int (*compar) (const void *, const void *));
static int	vac_cmp_blk(const void *left, const void *right);
static int	vac_cmp_offno(const void *left, const void *right);
static int	vac_cmp_vtlinks(const void *left, const void *right);
static bool enough_space(VacPage vacpage, Size len);
static Size PageGetFreeSpaceWithFillFactor(Relation relation, Page page);


/****************************************************************************
 *																			*
 *			Code common to all flavors of VACUUM and ANALYZE				*
 *																			*
 ****************************************************************************
 */


/*
 * Primary entry point for VACUUM and ANALYZE commands.
 *
 * relids is normally NIL; if it is not, then it provides the list of
 * relation OIDs to be processed, and vacstmt->relation is ignored.
 * (The non-NIL case is currently only used by autovacuum.)
 *
 * isTopLevel should be passed down from ProcessUtility.
 *
 * It is the caller's responsibility that both vacstmt and relids
 * (if given) be allocated in a memory context that won't disappear
 * at transaction commit.
 */
void
vacuum(VacuumStmt *vacstmt, List *relids, bool isTopLevel)
{
	const char *stmttype = vacstmt->vacuum ? "VACUUM" : "ANALYZE";
	volatile MemoryContext anl_context = NULL;
	volatile bool all_rels,
				in_outer_xact,
				use_own_xacts;
	List	   *relations;

	if (vacstmt->verbose)
		elevel = INFO;
	else
		elevel = DEBUG2;

	/*
	 * We cannot run VACUUM inside a user transaction block; if we were inside
	 * a transaction, then our commit- and start-transaction-command calls
	 * would not have the intended effect! Furthermore, the forced commit that
	 * occurs before truncating the relation's file would have the effect of
	 * committing the rest of the user's transaction too, which would
	 * certainly not be the desired behavior.  (This only applies to VACUUM
	 * FULL, though.  We could in theory run lazy VACUUM inside a transaction
	 * block, but we choose to disallow that case because we'd rather commit
	 * as soon as possible after finishing the vacuum.	This is mainly so that
	 * we can let go the AccessExclusiveLock that we may be holding.)
	 *
	 * ANALYZE (without VACUUM) can run either way.
	 */
	if (vacstmt->vacuum)
	{
		PreventTransactionChain(isTopLevel, stmttype);
		in_outer_xact = false;
	}
	else
		in_outer_xact = IsInTransactionChain(isTopLevel);

	/*
	 * Send info about dead objects to the statistics collector, unless we are
	 * in autovacuum --- autovacuum.c does this for itself.
	 */
	if (vacstmt->vacuum && !IsAutoVacuumWorkerProcess())
		pgstat_vacuum_tabstat();

	/*
	 * Create special memory context for cross-transaction storage.
	 *
	 * Since it is a child of PortalContext, it will go away eventually even
	 * if we suffer an error; there's no need for special abort cleanup logic.
	 */
	vac_context = AllocSetContextCreate(PortalContext,
										"Vacuum",
										ALLOCSET_DEFAULT_MINSIZE,
										ALLOCSET_DEFAULT_INITSIZE,
										ALLOCSET_DEFAULT_MAXSIZE);

	/* Remember whether we are processing everything in the DB */
	all_rels = (relids == NIL && vacstmt->relation == NULL);

	/*
	 * Build list of relations to process, unless caller gave us one. (If we
	 * build one, we put it in vac_context for safekeeping.)
	 */
	relations = get_rel_oids(relids, vacstmt->relation, stmttype);

	/*
	 * Decide whether we need to start/commit our own transactions.
	 *
	 * For VACUUM (with or without ANALYZE): always do so, so that we can
	 * release locks as soon as possible.  (We could possibly use the outer
	 * transaction for a one-table VACUUM, but handling TOAST tables would be
	 * problematic.)
	 *
	 * For ANALYZE (no VACUUM): if inside a transaction block, we cannot
	 * start/commit our own transactions.  Also, there's no need to do so if
	 * only processing one relation.  For multiple relations when not within a
	 * transaction block, use own transactions so we can release locks sooner.
	 */
	if (vacstmt->vacuum)
		use_own_xacts = true;
	else
	{
		Assert(vacstmt->analyze);
		if (in_outer_xact)
			use_own_xacts = false;
		else if (list_length(relations) > 1)
			use_own_xacts = true;
		else
			use_own_xacts = false;
	}

	/*
	 * If we are running ANALYZE without per-table transactions, we'll need a
	 * memory context with table lifetime.
	 */
	if (!use_own_xacts)
		anl_context = AllocSetContextCreate(PortalContext,
											"Analyze",
											ALLOCSET_DEFAULT_MINSIZE,
											ALLOCSET_DEFAULT_INITSIZE,
											ALLOCSET_DEFAULT_MAXSIZE);

	/*
	 * vacuum_rel expects to be entered with no transaction active; it will
	 * start and commit its own transaction.  But we are called by an SQL
	 * command, and so we are executing inside a transaction already. We
	 * commit the transaction started in PostgresMain() here, and start
	 * another one before exiting to match the commit waiting for us back in
	 * PostgresMain().
	 */
	if (use_own_xacts)
	{
		/* matches the StartTransaction in PostgresMain() */
		CommitTransactionCommand();
	}

	/* Turn vacuum cost accounting on or off */
	PG_TRY();
	{
		ListCell   *cur;

		VacuumCostActive = (VacuumCostDelay > 0);
		VacuumCostBalance = 0;

		/*
		 * Loop to process each selected relation.
		 */
		foreach(cur, relations)
		{
			Oid			relid = lfirst_oid(cur);

			if (vacstmt->vacuum)
				vacuum_rel(relid, vacstmt, RELKIND_RELATION);

			if (vacstmt->analyze)
			{
				MemoryContext old_context = NULL;

				/*
				 * If using separate xacts, start one for analyze. Otherwise,
				 * we can use the outer transaction, but we still need to call
				 * analyze_rel in a memory context that will be cleaned up on
				 * return (else we leak memory while processing multiple
				 * tables).
				 */
				if (use_own_xacts)
				{
					StartTransactionCommand();
					/* functions in indexes may want a snapshot set */
					ActiveSnapshot = CopySnapshot(GetTransactionSnapshot());
				}
				else
					old_context = MemoryContextSwitchTo(anl_context);

				/*
				 * Tell the buffer replacement strategy that vacuum is causing
				 * the IO
				 */
				StrategyHintVacuum(true);

				analyze_rel(relid, vacstmt);

				StrategyHintVacuum(false);

				if (use_own_xacts)
					CommitTransactionCommand();
				else
				{
					MemoryContextSwitchTo(old_context);
					MemoryContextResetAndDeleteChildren(anl_context);
				}
			}
		}
	}
	PG_CATCH();
	{
		/* Make sure cost accounting is turned off after error */
		VacuumCostActive = false;
		/* And reset buffer replacement strategy, too */
		StrategyHintVacuum(false);
		PG_RE_THROW();
	}
	PG_END_TRY();

	/* Turn off vacuum cost accounting */
	VacuumCostActive = false;

	/*
	 * Finish up processing.
	 */
	if (use_own_xacts)
	{
		/* here, we are not in a transaction */

		/*
		 * This matches the CommitTransaction waiting for us in
		 * PostgresMain().
		 */
		StartTransactionCommand();

		/*
		 * Re-establish the transaction snapshot.  This is wasted effort when
		 * we are called as a normal utility command, because the new
		 * transaction will be dropped immediately by PostgresMain(); but it's
		 * necessary if we are called from autovacuum because autovacuum might
		 * continue on to do an ANALYZE-only call.
		 */
		ActiveSnapshot = CopySnapshot(GetTransactionSnapshot());
	}

	if (vacstmt->vacuum && !IsAutoVacuumWorkerProcess())
	{
		/*
		 * Update pg_database.datfrozenxid, and truncate pg_clog if possible.
		 * (autovacuum.c does this for itself.)
		 */
		vac_update_datfrozenxid();

		/*
		 * If it was a database-wide VACUUM, print FSM usage statistics (we
		 * don't make you be superuser to see these).  We suppress this in
		 * autovacuum, too.
		 */
		if (all_rels)
			PrintFreeSpaceMapStatistics(elevel);
	}

	/*
	 * Clean up working storage --- note we must do this after
	 * StartTransactionCommand, else we might be trying to delete the active
	 * context!
	 */
	MemoryContextDelete(vac_context);
	vac_context = NULL;

	if (anl_context)
		MemoryContextDelete(anl_context);
}

/*
 * Build a list of Oids for each relation to be processed
 *
 * The list is built in vac_context so that it will survive across our
 * per-relation transactions.
 */
static List *
get_rel_oids(List *relids, const RangeVar *vacrel, const char *stmttype)
{
	List	   *oid_list = NIL;
	MemoryContext oldcontext;

	/* List supplied by VACUUM's caller? */
	if (relids)
		return relids;

	if (vacrel)
	{
		/* Process a specific relation */
		Oid			relid;

		relid = RangeVarGetRelid(vacrel, false);

		/* Make a relation list entry for this guy */
		oldcontext = MemoryContextSwitchTo(vac_context);
		oid_list = lappend_oid(oid_list, relid);
		MemoryContextSwitchTo(oldcontext);
	}
	else
	{
		/* Process all plain relations listed in pg_class */
		Relation	pgclass;
		HeapScanDesc scan;
		HeapTuple	tuple;
		ScanKeyData key;

		ScanKeyInit(&key,
					Anum_pg_class_relkind,
					BTEqualStrategyNumber, F_CHAREQ,
					CharGetDatum(RELKIND_RELATION));

		pgclass = heap_open(RelationRelationId, AccessShareLock);

		scan = heap_beginscan(pgclass, SnapshotNow, 1, &key);

		while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
		{
			/* Make a relation list entry for this guy */
			oldcontext = MemoryContextSwitchTo(vac_context);
			oid_list = lappend_oid(oid_list, HeapTupleGetOid(tuple));
			MemoryContextSwitchTo(oldcontext);
		}

		heap_endscan(scan);
		heap_close(pgclass, AccessShareLock);
	}

	return oid_list;
}

/*
 * vacuum_set_xid_limits() -- compute oldest-Xmin and freeze cutoff points
 */
void
vacuum_set_xid_limits(VacuumStmt *vacstmt, bool sharedRel,
					  TransactionId *oldestXmin,
					  TransactionId *freezeLimit)
{
	int			freezemin;
	TransactionId limit;
	TransactionId safeLimit;

	/*
	 * We can always ignore processes running lazy vacuum.	This is because we
	 * use these values only for deciding which tuples we must keep in the
	 * tables.	Since lazy vacuum doesn't write its XID anywhere, it's
	 * safe to ignore it.  In theory it could be problematic to ignore lazy
	 * vacuums on a full vacuum, but keep in mind that only one vacuum process
	 * can be working on a particular table at any time, and that each vacuum
	 * is always an independent transaction.
	 */
	*oldestXmin = GetOldestXmin(sharedRel, true);

	Assert(TransactionIdIsNormal(*oldestXmin));

	/*
	 * Determine the minimum freeze age to use: as specified in the vacstmt,
	 * or vacuum_freeze_min_age, but in any case not more than half
	 * autovacuum_freeze_max_age, so that autovacuums to prevent XID
	 * wraparound won't occur too frequently.
	 */
	freezemin = vacstmt->freeze_min_age;
	if (freezemin < 0)
		freezemin = vacuum_freeze_min_age;
	freezemin = Min(freezemin, autovacuum_freeze_max_age / 2);
	Assert(freezemin >= 0);

	/*
	 * Compute the cutoff XID, being careful not to generate a "permanent" XID
	 */
	limit = *oldestXmin - freezemin;
	if (!TransactionIdIsNormal(limit))
		limit = FirstNormalTransactionId;

	/*
	 * If oldestXmin is very far back (in practice, more than
	 * autovacuum_freeze_max_age / 2 XIDs old), complain and force a
	 * minimum freeze age of zero.
	 */
	safeLimit = ReadNewTransactionId() - autovacuum_freeze_max_age;
	if (!TransactionIdIsNormal(safeLimit))
		safeLimit = FirstNormalTransactionId;

	if (TransactionIdPrecedes(limit, safeLimit))
	{
		ereport(WARNING,
				(errmsg("oldest xmin is far in the past"),
				 errhint("Close open transactions soon to avoid wraparound problems.")));
		limit = *oldestXmin;
	}

	*freezeLimit = limit;
}
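
/*
 * Worked example of the cutoff arithmetic above: with *oldestXmin at
 * 500,000,100 and an effective freezemin of 100,000,000, the computed
 * freeze limit is 400,000,100, so tuples whose xmin precedes that XID will
 * be frozen.  Early in a cluster's life the subtraction can fall below
 * FirstNormalTransactionId, in which case the limit is clamped there.
 */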


/*
 *	vac_update_relstats() -- update statistics for one relation
 *
 *		Update the whole-relation statistics that are kept in its pg_class
 *		row.  There are additional stats that will be updated if we are
 *		doing ANALYZE, but we always update these stats.  This routine works
 *		for both index and heap relation entries in pg_class.
 *
 *		We violate transaction semantics here by overwriting the rel's
 *		existing pg_class tuple with the new values.  This is reasonably
 *		safe since the new values are correct whether or not this transaction
 *		commits.  The reason for this is that if we updated these tuples in
 *		the usual way, vacuuming pg_class itself wouldn't work very well ---
 *		by the time we got done with a vacuum cycle, most of the tuples in
 *		pg_class would've been obsoleted.  Of course, this only works for
 *		fixed-size never-null columns, but these are.
 *
 *		Another reason for doing it this way is that when we are in a lazy
 *		VACUUM and have inVacuum set, we mustn't do any updates --- somebody
 *		vacuuming pg_class might think they could delete a tuple marked with
 *		xmin = our xid.
 *
 *		This routine is shared by full VACUUM, lazy VACUUM, and stand-alone
 *		ANALYZE.
 */
void
vac_update_relstats(Oid relid, BlockNumber num_pages, double num_tuples,
					bool hasindex, TransactionId frozenxid)
{
	Relation	rd;
	HeapTuple	ctup;
	Form_pg_class pgcform;
	bool		dirty;

	rd = heap_open(RelationRelationId, RowExclusiveLock);

	/* Fetch a copy of the tuple to scribble on */
	ctup = SearchSysCacheCopy(RELOID,
							  ObjectIdGetDatum(relid),
							  0, 0, 0);
	if (!HeapTupleIsValid(ctup))
		elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
			 relid);
	pgcform = (Form_pg_class) GETSTRUCT(ctup);

	/* Apply required updates, if any, to copied tuple */

	dirty = false;
	if (pgcform->relpages != (int32) num_pages)
	{
		pgcform->relpages = (int32) num_pages;
		dirty = true;
	}
	if (pgcform->reltuples != (float4) num_tuples)
	{
		pgcform->reltuples = (float4) num_tuples;
		dirty = true;
	}
	if (pgcform->relhasindex != hasindex)
	{
		pgcform->relhasindex = hasindex;
		dirty = true;
	}

	/*
	 * If we have discovered that there are no indexes, then there's no
	 * primary key either.	This could be done more thoroughly...
	 */
	if (!hasindex)
	{
		if (pgcform->relhaspkey)
		{
			pgcform->relhaspkey = false;
			dirty = true;
		}
	}

	/*
	 * relfrozenxid should never go backward.  Caller can pass
	 * InvalidTransactionId if it has no new data.
	 */
	if (TransactionIdIsNormal(frozenxid) &&
		TransactionIdPrecedes(pgcform->relfrozenxid, frozenxid))
	{
		pgcform->relfrozenxid = frozenxid;
		dirty = true;
	}

	/*
	 * If anything changed, write out the tuple.  Even if nothing changed,
	 * force relcache invalidation so all backends reset their rd_targblock
	 * --- otherwise it might point to a page we truncated away.
	 */
	if (dirty)
	{
		heap_inplace_update(rd, ctup);
		/* the above sends a cache inval message */
	}
	else
	{
		/* no need to change tuple, but force relcache inval anyway */
		CacheInvalidateRelcacheByTuple(ctup);
	}

	heap_close(rd, RowExclusiveLock);
}
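
/*
 * For reference, full_vacuum_rel() below invokes this as
 *
 *		vac_update_relstats(RelationGetRelid(onerel), vacrelstats->rel_pages,
 *							vacrelstats->rel_tuples, vacrelstats->hasindex,
 *							FreezeLimit);
 *
 * so the relfrozenxid recorded for the table is the FreezeLimit used while
 * scanning it.
 */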


/*
 *	vac_update_datfrozenxid() -- update pg_database.datfrozenxid for our DB
 *
 *		Update pg_database's datfrozenxid entry for our database to be the
 *		minimum of the pg_class.relfrozenxid values.  If we are able to
 *		advance pg_database.datfrozenxid, also try to truncate pg_clog.
 *
 *		We violate transaction semantics here by overwriting the database's
 *		existing pg_database tuple with the new value.  This is reasonably
 *		safe since the new value is correct whether or not this transaction
 *		commits.  As with vac_update_relstats, this avoids leaving dead tuples
 *		behind after a VACUUM.
 *
 *		This routine is shared by full and lazy VACUUM.
 */
void
vac_update_datfrozenxid(void)
{
	HeapTuple	tuple;
	Form_pg_database dbform;
	Relation	relation;
	SysScanDesc scan;
	HeapTuple	classTup;
	TransactionId newFrozenXid;
	bool		dirty = false;

	/*
	 * Initialize the "min" calculation with RecentGlobalXmin.  Any
	 * not-yet-committed pg_class entries for new tables must have
	 * relfrozenxid at least this high, because any other open xact must have
	 * RecentXmin >= its PGPROC.xmin >= our RecentGlobalXmin; see
	 * AddNewRelationTuple().  So we cannot produce a wrong minimum by
	 * starting with this.
	 */
	newFrozenXid = RecentGlobalXmin;

	/*
	 * We must seqscan pg_class to find the minimum Xid, because there is no
	 * index that can help us here.
	 */
	relation = heap_open(RelationRelationId, AccessShareLock);

	scan = systable_beginscan(relation, InvalidOid, false,
							  SnapshotNow, 0, NULL);

	while ((classTup = systable_getnext(scan)) != NULL)
	{
		Form_pg_class classForm = (Form_pg_class) GETSTRUCT(classTup);

		/*
		 * Only consider heap and TOAST tables (anything else should have
		 * InvalidTransactionId in relfrozenxid anyway.)
		 */
		if (classForm->relkind != RELKIND_RELATION &&
			classForm->relkind != RELKIND_TOASTVALUE)
			continue;

		Assert(TransactionIdIsNormal(classForm->relfrozenxid));

		if (TransactionIdPrecedes(classForm->relfrozenxid, newFrozenXid))
			newFrozenXid = classForm->relfrozenxid;
	}

	/* we're done with pg_class */
	systable_endscan(scan);
	heap_close(relation, AccessShareLock);

	Assert(TransactionIdIsNormal(newFrozenXid));

	/* Now fetch the pg_database tuple we need to update. */
	relation = heap_open(DatabaseRelationId, RowExclusiveLock);

	/* Fetch a copy of the tuple to scribble on */
	tuple = SearchSysCacheCopy(DATABASEOID,
							   ObjectIdGetDatum(MyDatabaseId),
							   0, 0, 0);
	if (!HeapTupleIsValid(tuple))
		elog(ERROR, "could not find tuple for database %u", MyDatabaseId);
	dbform = (Form_pg_database) GETSTRUCT(tuple);

	/*
	 * Don't allow datfrozenxid to go backward (probably can't happen anyway);
	 * and detect the common case where it doesn't go forward either.
	 */
	if (TransactionIdPrecedes(dbform->datfrozenxid, newFrozenXid))
	{
		dbform->datfrozenxid = newFrozenXid;
		dirty = true;
	}

	if (dirty)
		heap_inplace_update(relation, tuple);

	heap_freetuple(tuple);
	heap_close(relation, RowExclusiveLock);

	/*
	 * If we were able to advance datfrozenxid, mark the flat-file copy of
	 * pg_database for update at commit, and see if we can truncate
	 * pg_clog.
	 */
	if (dirty)
	{
		database_file_update_needed();
		vac_truncate_clog(newFrozenXid);
	}
}


/*
 *	vac_truncate_clog() -- attempt to truncate the commit log
 *
 *		Scan pg_database to determine the system-wide oldest datfrozenxid,
 *		and use it to truncate the transaction commit log (pg_clog).
 *		Also update the XID wrap limit info maintained by varsup.c.
 *
 *		The passed XID is simply the one I just wrote into my pg_database
 *		entry.	It's used to initialize the "min" calculation.
 *
 *		This routine is shared by full and lazy VACUUM.  Note that it's
 *		only invoked when we've managed to change our DB's datfrozenxid
 *		entry.
 */
static void
vac_truncate_clog(TransactionId frozenXID)
{
	TransactionId myXID = GetCurrentTransactionId();
	Relation	relation;
	HeapScanDesc scan;
	HeapTuple	tuple;
	NameData	oldest_datname;
	bool		frozenAlreadyWrapped = false;

	/* init oldest_datname to sync with my frozenXID */
	namestrcpy(&oldest_datname, get_database_name(MyDatabaseId));

	/*
	 * Scan pg_database to compute the minimum datfrozenxid
	 *
	 * Note: we need not worry about a race condition with new entries being
	 * inserted by CREATE DATABASE.  Any such entry will have a copy of some
	 * existing DB's datfrozenxid, and that source DB cannot be ours because
	 * of the interlock against copying a DB containing an active backend.
	 * Hence the new entry will not reduce the minimum.  Also, if two
	 * VACUUMs concurrently modify the datfrozenxid's of different databases,
	 * the worst possible outcome is that pg_clog is not truncated as
	 * aggressively as it could be.
	 */
	relation = heap_open(DatabaseRelationId, AccessShareLock);

	scan = heap_beginscan(relation, SnapshotNow, 0, NULL);

	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		Form_pg_database dbform = (Form_pg_database) GETSTRUCT(tuple);

		Assert(TransactionIdIsNormal(dbform->datfrozenxid));

		if (TransactionIdPrecedes(myXID, dbform->datfrozenxid))
			frozenAlreadyWrapped = true;
		else if (TransactionIdPrecedes(dbform->datfrozenxid, frozenXID))
		{
			frozenXID = dbform->datfrozenxid;
			namecpy(&oldest_datname, &dbform->datname);
		}
	}

	heap_endscan(scan);

	heap_close(relation, AccessShareLock);

	/*
	 * Do not truncate CLOG if we seem to have suffered wraparound already;
	 * the computed minimum XID might be bogus.  This case should now be
	 * impossible due to the defenses in GetNewTransactionId, but we keep the
	 * test anyway.
	 */
	if (frozenAlreadyWrapped)
	{
		ereport(WARNING,
				(errmsg("some databases have not been vacuumed in over 2 billion transactions"),
				 errdetail("You might have already suffered transaction-wraparound data loss.")));
		return;
	}

	/* Truncate CLOG to the oldest frozenxid */
	TruncateCLOG(frozenXID);

	/*
	 * Update the wrap limit for GetNewTransactionId.  Note: this function
	 * will also signal the postmaster for an(other) autovac cycle if needed.
	 */
	SetTransactionIdLimit(frozenXID, &oldest_datname);
}


/****************************************************************************
 *																			*
 *			Code common to both flavors of VACUUM							*
 *																			*
 ****************************************************************************
 */


/*
 *	vacuum_rel() -- vacuum one heap relation
 *
 *		Doing one heap at a time incurs extra overhead, since we need to
 *		check that the heap exists again just before we vacuum it.	The
 *		reason that we do this is so that vacuuming can be spread across
 *		many small transactions.  Otherwise, two-phase locking would require
 *		us to lock the entire database during one pass of the vacuum cleaner.
 *
 *		At entry and exit, we are not inside a transaction.
 */
static void
vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind)
{
	LOCKMODE	lmode;
	Relation	onerel;
	LockRelId	onerelid;
	Oid			toast_relid;

	/* Begin a transaction for vacuuming this relation */
	StartTransactionCommand();

	if (vacstmt->full)
	{
		/* functions in indexes may want a snapshot set */
		ActiveSnapshot = CopySnapshot(GetTransactionSnapshot());
	}
	else
	{
		/*
		 * During a lazy VACUUM we do not run any user-supplied functions, and
		 * so it should be safe to not create a transaction snapshot.
		 *
		 * We can furthermore set the inVacuum flag, which lets other
		 * concurrent VACUUMs know that they can ignore this one while
		 * determining their OldestXmin.  (The reason we don't set inVacuum
		 * during a full VACUUM is exactly that we may have to run user-
		 * defined functions for functional indexes, and we want to make sure
		 * that if they use the snapshot set above, any tuples it requires
		 * can't get removed from other tables.  An index function that
		 * depends on the contents of other tables is arguably broken, but we
		 * won't break it here by violating transaction semantics.)
		 *
		 * Note: the inVacuum flag remains set until CommitTransaction or
		 * AbortTransaction.  We don't want to clear it until we reset
		 * MyProc->xid/xmin, else OldestXmin might appear to go backwards,
		 * which is probably Not Good.
		 */
		MyProc->inVacuum = true;
	}

	/*
	 * Check for user-requested abort.	Note we want this to be inside a
	 * transaction, so xact.c doesn't issue useless WARNING.
	 */
	CHECK_FOR_INTERRUPTS();

	/*
	 * Determine the type of lock we want --- hard exclusive lock for a FULL
	 * vacuum, but just ShareUpdateExclusiveLock for concurrent vacuum. Either
	 * way, we can be sure that no other backend is vacuuming the same table.
	 */
	lmode = vacstmt->full ? AccessExclusiveLock : ShareUpdateExclusiveLock;

	/*
	 * Open the relation and get the appropriate lock on it.
	 *
	 * There's a race condition here: the rel may have gone away since the
	 * last time we saw it.  If so, we don't need to vacuum it.
	 */
	onerel = try_relation_open(relid, lmode);

	if (!onerel)
	{
		CommitTransactionCommand();
		return;
	}

	/*
	 * Check permissions.
	 *
	 * We allow the user to vacuum a table if he is superuser, the table
	 * owner, or the database owner (but in the latter case, only if it's not
	 * a shared relation).	pg_class_ownercheck includes the superuser case.
	 *
	 * Note we choose to treat permissions failure as a WARNING and keep
	 * trying to vacuum the rest of the DB --- is this appropriate?
	 */
	if (!(pg_class_ownercheck(RelationGetRelid(onerel), GetUserId()) ||
		  (pg_database_ownercheck(MyDatabaseId, GetUserId()) && !onerel->rd_rel->relisshared)))
	{
		ereport(WARNING,
				(errmsg("skipping \"%s\" --- only table or database owner can vacuum it",
						RelationGetRelationName(onerel))));
		relation_close(onerel, lmode);
		CommitTransactionCommand();
		return;
	}

	/*
	 * Check that it's a plain table; we used to do this in get_rel_oids() but
	 * seems safer to check after we've locked the relation.
	 */
	if (onerel->rd_rel->relkind != expected_relkind)
	{
		ereport(WARNING,
				(errmsg("skipping \"%s\" --- cannot vacuum indexes, views, or special system tables",
						RelationGetRelationName(onerel))));
		relation_close(onerel, lmode);
		CommitTransactionCommand();
		return;
	}

	/*
	 * Silently ignore tables that are temp tables of other backends ---
	 * trying to vacuum these will lead to great unhappiness, since their
	 * contents are probably not up-to-date on disk.  (We don't throw a
	 * warning here; it would just lead to chatter during a database-wide
	 * VACUUM.)
	 */
	if (isOtherTempNamespace(RelationGetNamespace(onerel)))
	{
		relation_close(onerel, lmode);
		CommitTransactionCommand();
		return;
	}

	/*
	 * Get a session-level lock too. This will protect our access to the
	 * relation across multiple transactions, so that we can vacuum the
	 * relation's TOAST table (if any) secure in the knowledge that no one is
	 * deleting the parent relation.
	 *
	 * NOTE: this cannot block, even if someone else is waiting for access,
	 * because the lock manager knows that both lock requests are from the
	 * same process.
	 */
	onerelid = onerel->rd_lockInfo.lockRelId;
	LockRelationIdForSession(&onerelid, lmode);

	/*
	 * Remember the relation's TOAST relation for later
	 */
	toast_relid = onerel->rd_rel->reltoastrelid;

	/*
	 * Tell the cache replacement strategy that vacuum is causing all
	 * following IO
	 */
	StrategyHintVacuum(true);

	/*
	 * Do the actual work --- either FULL or "lazy" vacuum
	 */
	if (vacstmt->full)
		full_vacuum_rel(onerel, vacstmt);
	else
		lazy_vacuum_rel(onerel, vacstmt);

	StrategyHintVacuum(false);

	/* all done with this class, but hold lock until commit */
	relation_close(onerel, NoLock);

	/*
	 * Complete the transaction and free all temporary memory used.
	 */
	CommitTransactionCommand();

	/*
	 * If the relation has a secondary toast rel, vacuum that too while we
	 * still hold the session lock on the master table.  Note however that
	 * "analyze" will not get done on the toast table.	This is good, because
	 * the toaster always uses hardcoded index access and statistics are
	 * totally unimportant for toast relations.
	 */
	if (toast_relid != InvalidOid)
		vacuum_rel(toast_relid, vacstmt, RELKIND_TOASTVALUE);

	/*
	 * Now release the session-level lock on the master table.
	 */
	UnlockRelationIdForSession(&onerelid, lmode);
}


/****************************************************************************
 *																			*
 *			Code for VACUUM FULL (only)										*
 *																			*
 ****************************************************************************
 */


/*
 *	full_vacuum_rel() -- perform FULL VACUUM for one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indexes, and
 *		updates its num_pages and num_tuples statistics.
 *
 *		At entry, we have already established a transaction and opened
 *		and locked the relation.
 */
static void
full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
{
	VacPageListData vacuum_pages;		/* List of pages to vacuum and/or
										 * clean indexes */
	VacPageListData fraged_pages;		/* List of pages with space enough for
										 * re-using */
	Relation   *Irel;
	int			nindexes,
				i;
	VRelStats  *vacrelstats;

	vacuum_set_xid_limits(vacstmt, onerel->rd_rel->relisshared,
						  &OldestXmin, &FreezeLimit);

	/*
	 * Set up statistics-gathering machinery.
	 */
	vacrelstats = (VRelStats *) palloc(sizeof(VRelStats));
	vacrelstats->rel_pages = 0;
	vacrelstats->rel_tuples = 0;
	vacrelstats->hasindex = false;

	/* scan the heap */
	vacuum_pages.num_pages = fraged_pages.num_pages = 0;
	scan_heap(vacrelstats, onerel, &vacuum_pages, &fraged_pages);

	/* Now open all indexes of the relation */
	vac_open_indexes(onerel, AccessExclusiveLock, &nindexes, &Irel);
	if (nindexes > 0)
		vacrelstats->hasindex = true;

	/* Clean/scan index relation(s) */
	if (Irel != NULL)
	{
		if (vacuum_pages.num_pages > 0)
		{
			for (i = 0; i < nindexes; i++)
				vacuum_index(&vacuum_pages, Irel[i],
							 vacrelstats->rel_tuples, 0);
		}
		else
		{
			/* just scan indexes to update statistic */
			for (i = 0; i < nindexes; i++)
				scan_index(Irel[i], vacrelstats->rel_tuples);
		}
	}

	if (fraged_pages.num_pages > 0)
	{
		/* Try to shrink heap */
		repair_frag(vacrelstats, onerel, &vacuum_pages, &fraged_pages,
					nindexes, Irel);
		vac_close_indexes(nindexes, Irel, NoLock);
	}
	else
	{
		vac_close_indexes(nindexes, Irel, NoLock);
		if (vacuum_pages.num_pages > 0)
		{
			/* Clean pages from vacuum_pages list */
			vacuum_heap(vacrelstats, onerel, &vacuum_pages);
		}
	}

	/* update shared free space map with final free space info */
	vac_update_fsm(onerel, &fraged_pages, vacrelstats->rel_pages);

	/* update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(onerel), vacrelstats->rel_pages,
						vacrelstats->rel_tuples, vacrelstats->hasindex,
						FreezeLimit);

	/* report results to the stats collector, too */
	pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared,
						 vacstmt->analyze, vacrelstats->rel_tuples);
}


/*
 *	scan_heap() -- scan an open heap relation
 *
 *		This routine sets commit status bits, constructs vacuum_pages (list
 *		of pages we need to compact free space on and/or clean indexes of
 *		deleted tuples), constructs fraged_pages (list of pages with free
 *		space that tuples could be moved into), and calculates statistics
 *		on the number of live tuples in the heap.
 */
static void
scan_heap(VRelStats *vacrelstats, Relation onerel,
		  VacPageList vacuum_pages, VacPageList fraged_pages)
{
	BlockNumber nblocks,
				blkno;
	char	   *relname;
	VacPage		vacpage;
	BlockNumber empty_pages,
				empty_end_pages;
	double		num_tuples,
				tups_vacuumed,
				nkeep,
				nunused;
	double		free_space,
				usable_free_space;
	Size		min_tlen = MaxHeapTupleSize;
	Size		max_tlen = 0;
	bool		do_shrinking = true;
	VTupleLink	vtlinks = (VTupleLink) palloc(100 * sizeof(VTupleLinkData));
	int			num_vtlinks = 0;
	int			free_vtlinks = 100;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	relname = RelationGetRelationName(onerel);
	ereport(elevel,
			(errmsg("vacuuming \"%s.%s\"",
					get_namespace_name(RelationGetNamespace(onerel)),
					relname)));

	empty_pages = empty_end_pages = 0;
	num_tuples = tups_vacuumed = nkeep = nunused = 0;
	free_space = 0;

	nblocks = RelationGetNumberOfBlocks(onerel);

	/*
	 * We initially create each VacPage item in a maximal-sized workspace,
	 * then copy the workspace into a just-large-enough copy.
	 */
	vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Page		page,
					tempPage = NULL;
		bool		do_reap,
					do_frag;
		Buffer		buf;
		OffsetNumber offnum,
					maxoff;
		bool		notup;
		OffsetNumber frozen[MaxOffsetNumber];
		int			nfrozen;

		vacuum_delay_point();

		buf = ReadBuffer(onerel, blkno);
		page = BufferGetPage(buf);

		/*
		 * Since we are holding exclusive lock on the relation, no other
		 * backend can be accessing the page; however it is possible that the
		 * background writer will try to write the page if it's already marked
		 * dirty.  To ensure that invalid data doesn't get written to disk, we
		 * must take exclusive buffer lock wherever we potentially modify
		 * pages.
		 */
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

		vacpage->blkno = blkno;
		vacpage->offsets_used = 0;
		vacpage->offsets_free = 0;

		if (PageIsNew(page))
		{
			VacPage		vacpagecopy;

			ereport(WARNING,
			   (errmsg("relation \"%s\" page %u is uninitialized --- fixing",
					   relname, blkno)));
			PageInit(page, BufferGetPageSize(buf), 0);
			MarkBufferDirty(buf);
			vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, page);
			free_space += vacpage->free;
			empty_pages++;
			empty_end_pages++;
			vacpagecopy = copy_vac_page(vacpage);
			vpage_insert(vacuum_pages, vacpagecopy);
			vpage_insert(fraged_pages, vacpagecopy);
			UnlockReleaseBuffer(buf);
			continue;
		}

		if (PageIsEmpty(page))
		{
			VacPage		vacpagecopy;

			vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, page);
			free_space += vacpage->free;
			empty_pages++;
			empty_end_pages++;
			vacpagecopy = copy_vac_page(vacpage);
			vpage_insert(vacuum_pages, vacpagecopy);
			vpage_insert(fraged_pages, vacpagecopy);
			UnlockReleaseBuffer(buf);
			continue;
		}

		nfrozen = 0;
		notup = true;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid = PageGetItemId(page, offnum);
			bool		tupgone = false;
			HeapTupleData tuple;

			/*
1357
			 * Collect un-used items too - it's possible to have indexes
1358 1359 1360 1361
			 * pointing here after crash.
			 */
			if (!ItemIdIsUsed(itemid))
			{
B
Bruce Momjian 已提交
1362
				vacpage->offsets[vacpage->offsets_free++] = offnum;
1363
				nunused += 1;
1364 1365 1366
				continue;
			}

1367 1368 1369
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);
1370

1371
			switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
1372
			{
1373
				case HEAPTUPLE_DEAD:
1374
					tupgone = true;		/* we can delete the tuple */
1375 1376
					break;
				case HEAPTUPLE_LIVE:
1377
					/* Tuple is good --- but let's do some validity checks */
1378 1379 1380 1381
					if (onerel->rd_rel->relhasoids &&
						!OidIsValid(HeapTupleGetOid(&tuple)))
						elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
							 relname, blkno, offnum);
1382 1383
					break;
				case HEAPTUPLE_RECENTLY_DEAD:
1384

1385
					/*
B
Bruce Momjian 已提交
1386 1387
					 * If tuple is recently deleted then we must not remove it
					 * from relation.
1388
					 */
1389
					nkeep += 1;
1390

1391
					/*
B
Bruce Momjian 已提交
1392 1393
					 * If we do shrinking and this tuple is updated one then
					 * remember it to construct updated tuple dependencies.
1394
					 */
1395 1396 1397
					if (do_shrinking &&
						!(ItemPointerEquals(&(tuple.t_self),
											&(tuple.t_data->t_ctid))))
1398 1399 1400 1401
					{
						if (free_vtlinks == 0)
						{
							free_vtlinks = 1000;
B
Bruce Momjian 已提交
1402
							vtlinks = (VTupleLink) repalloc(vtlinks,
B
Bruce Momjian 已提交
1403 1404
											   (free_vtlinks + num_vtlinks) *
													 sizeof(VTupleLinkData));
1405 1406 1407 1408 1409 1410
						}
						vtlinks[num_vtlinks].new_tid = tuple.t_data->t_ctid;
						vtlinks[num_vtlinks].this_tid = tuple.t_self;
						free_vtlinks--;
						num_vtlinks++;
					}
1411 1412
					break;
				case HEAPTUPLE_INSERT_IN_PROGRESS:
1413

1414
					/*
B
Bruce Momjian 已提交
1415 1416 1417 1418
					 * This should not happen, since we hold exclusive lock on
					 * the relation; shouldn't we raise an error? (Actually,
					 * it can happen in system catalogs, since we tend to
					 * release write lock before commit there.)
1419
					 */
1420
					ereport(NOTICE,
1421
							(errmsg("relation \"%s\" TID %u/%u: InsertTransactionInProgress %u --- cannot shrink relation",
1422
									relname, blkno, offnum, HeapTupleHeaderGetXmin(tuple.t_data))));
1423 1424 1425
					do_shrinking = false;
					break;
				case HEAPTUPLE_DELETE_IN_PROGRESS:
1426

1427
					/*
B
Bruce Momjian 已提交
1428 1429 1430 1431
					 * This should not happen, since we hold exclusive lock on
					 * the relation; shouldn't we raise an error? (Actually,
					 * it can happen in system catalogs, since we tend to
					 * release write lock before commit there.)
1432
					 */
1433
					ereport(NOTICE,
1434
							(errmsg("relation \"%s\" TID %u/%u: DeleteTransactionInProgress %u --- cannot shrink relation",
1435
									relname, blkno, offnum, HeapTupleHeaderGetXmax(tuple.t_data))));
1436 1437 1438
					do_shrinking = false;
					break;
				default:
1439
					elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1440
					break;
1441 1442 1443 1444
			}

			if (tupgone)
			{
1445
				ItemId		lpp;
1446

1447
				/*
B
Bruce Momjian 已提交
1448 1449
				 * Here we are building a temporary copy of the page with dead
				 * tuples removed.	Below we will apply
1450
				 * PageRepairFragmentation to the copy, so that we can
B
Bruce Momjian 已提交
1451 1452 1453
				 * determine how much space will be available after removal of
				 * dead tuples.  But note we are NOT changing the real page
				 * yet...
1454
				 */
1455
				if (tempPage == NULL)
1456
				{
1457
					Size		pageSize;
1458 1459 1460

					pageSize = PageGetPageSize(page);
					tempPage = (Page) palloc(pageSize);
1461
					memcpy(tempPage, page, pageSize);
1462 1463
				}

1464
				/* mark it unused on the temp page */
1465
				lpp = PageGetItemId(tempPage, offnum);
1466 1467
				lpp->lp_flags &= ~LP_USED;

B
Bruce Momjian 已提交
1468
				vacpage->offsets[vacpage->offsets_free++] = offnum;
1469
				tups_vacuumed += 1;
1470 1471 1472
			}
			else
			{
1473
				num_tuples += 1;
1474
				notup = false;
1475 1476 1477 1478
				if (tuple.t_len < min_tlen)
					min_tlen = tuple.t_len;
				if (tuple.t_len > max_tlen)
					max_tlen = tuple.t_len;
1479

B
Bruce Momjian 已提交
1480
				/*
1481 1482
				 * Each non-removable tuple must be checked to see if it
				 * needs freezing.
1483
				 */
1484 1485 1486
				if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
									  InvalidBuffer))
					frozen[nfrozen++] = offnum;
1487
			}
1488
		}						/* scan along page */
1489

1490
		if (tempPage != NULL)
1491 1492
		{
			/* Some tuples are removable; figure free space after removal */
1493
			PageRepairFragmentation(tempPage, NULL);
B
Bruce Momjian 已提交
1494
			vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, tempPage);
1495
			pfree(tempPage);
1496
			do_reap = true;
1497
		}
1498 1499 1500
		else
		{
			/* Just use current available space */
B
Bruce Momjian 已提交
1501
			vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, page);
1502 1503
			/* Need to reap the page if it has ~LP_USED line pointers */
			do_reap = (vacpage->offsets_free > 0);
V
Vadim B. Mikheev 已提交
1504
		}
1505

1506
		free_space += vacpage->free;
1507

1508 1509
		/*
		 * Add the page to fraged_pages if it has a useful amount of free
1510
		 * space.  "Useful" means enough for a minimal-sized tuple. But we
B
Bruce Momjian 已提交
1511 1512
		 * don't know that accurately near the start of the relation, so add
		 * pages unconditionally if they have >= BLCKSZ/10 free space.
1513
		 */
1514
		do_frag = (vacpage->free >= min_tlen || vacpage->free >= BLCKSZ / 10);
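		/*
		 * For instance, with the default 8 kB BLCKSZ the unconditional
		 * threshold is 8192 / 10 = 819 bytes of free space.
		 */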

		if (do_reap || do_frag)
		{
			VacPage		vacpagecopy = copy_vac_page(vacpage);

			if (do_reap)
				vpage_insert(vacuum_pages, vacpagecopy);
			if (do_frag)
				vpage_insert(fraged_pages, vacpagecopy);
		}

		/*
		 * Include the page in empty_end_pages if it will be empty after
		 * vacuuming; this is to keep us from using it as a move destination.
		 */
		if (notup)
		{
			empty_pages++;
			empty_end_pages++;
		}
		else
			empty_end_pages = 0;

		/*
		 * If we froze any tuples, mark the buffer dirty, and write a WAL
		 * record recording the changes.  We must log the changes to be
		 * crash-safe against future truncation of CLOG.
		 */
		if (nfrozen > 0)
		{
			MarkBufferDirty(buf);
			/* no XLOG for temp tables, though */
			if (!onerel->rd_istemp)
			{
				XLogRecPtr	recptr;

				recptr = log_heap_freeze(onerel, buf, FreezeLimit,
										 frozen, nfrozen);
				PageSetLSN(page, recptr);
				PageSetTLI(page, ThisTimeLineID);
			}
		}

		UnlockReleaseBuffer(buf);
	}

	pfree(vacpage);

	/* save stats in the rel list for use later */
	vacrelstats->rel_tuples = num_tuples;
	vacrelstats->rel_pages = nblocks;
	if (num_tuples == 0)
		min_tlen = max_tlen = 0;
	vacrelstats->min_tlen = min_tlen;
	vacrelstats->max_tlen = max_tlen;

	vacuum_pages->empty_end_pages = empty_end_pages;
	fraged_pages->empty_end_pages = empty_end_pages;

	/*
	 * Clear the fraged_pages list if we found we couldn't shrink. Else,
	 * remove any "empty" end-pages from the list, and compute usable free
	 * space = free space in remaining pages.
	 */
	if (do_shrinking)
	{
		int			i;

		Assert((BlockNumber) fraged_pages->num_pages >= empty_end_pages);
		fraged_pages->num_pages -= empty_end_pages;
		usable_free_space = 0;
		for (i = 0; i < fraged_pages->num_pages; i++)
			usable_free_space += fraged_pages->pagedesc[i]->free;
	}
	else
	{
		fraged_pages->num_pages = 0;
		usable_free_space = 0;
	}

	/* don't bother to save vtlinks if we will not call repair_frag */
	if (fraged_pages->num_pages > 0 && num_vtlinks > 0)
	{
		qsort((char *) vtlinks, num_vtlinks, sizeof(VTupleLinkData),
			  vac_cmp_vtlinks);
		vacrelstats->vtlinks = vtlinks;
		vacrelstats->num_vtlinks = num_vtlinks;
	}
	else
	{
		vacrelstats->vtlinks = NULL;
		vacrelstats->num_vtlinks = 0;
		pfree(vtlinks);
	}

	ereport(elevel,
			(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
					RelationGetRelationName(onerel),
					tups_vacuumed, num_tuples, nblocks),
			 errdetail("%.0f dead row versions cannot be removed yet.\n"
			  "Nonremovable row versions range from %lu to %lu bytes long.\n"
					   "There were %.0f unused item pointers.\n"
	   "Total free space (including removable row versions) is %.0f bytes.\n"
					   "%u pages are or will become empty, including %u at the end of the table.\n"
	 "%u pages containing %.0f free bytes are potential move destinations.\n"
					   "%s.",
					   nkeep,
					   (unsigned long) min_tlen, (unsigned long) max_tlen,
					   nunused,
					   free_space,
					   empty_pages, empty_end_pages,
					   fraged_pages->num_pages, usable_free_space,
					   pg_rusage_show(&ru0))));
}

1630 1631

/*
 *	repair_frag() -- try to repair relation's fragmentation
 *
 *		This routine marks dead tuples as unused and tries re-use dead space
 *		by moving tuples (and inserting indexes if needed). It constructs
 *		Nvacpagelist list of free-ed pages (moved tuples) and clean indexes
 *		for them after committing (in hack-manner - without losing locks
 *		and freeing memory!) current transaction. It truncates relation
 *		if some end-blocks are gone away.
 */
static void
repair_frag(VRelStats *vacrelstats, Relation onerel,
			VacPageList vacuum_pages, VacPageList fraged_pages,
			int nindexes, Relation *Irel)
{
	TransactionId myXID = GetCurrentTransactionId();
	Buffer		dst_buffer = InvalidBuffer;
	BlockNumber nblocks,
				blkno;
	BlockNumber last_move_dest_block = 0,
				last_vacuum_block;
	Page		dst_page = NULL;
	ExecContextData ec;
	VacPageListData Nvacpagelist;
	VacPage		dst_vacpage = NULL,
				last_vacuum_page,
				vacpage,
			   *curpage;
	int			i;
	int			num_moved = 0,
				num_fraged_pages,
				vacuumed_pages;
	int			keep_tuples = 0;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	ExecContext_Init(&ec, onerel);

	Nvacpagelist.num_pages = 0;
	num_fraged_pages = fraged_pages->num_pages;
	Assert((BlockNumber) vacuum_pages->num_pages >= vacuum_pages->empty_end_pages);
	vacuumed_pages = vacuum_pages->num_pages - vacuum_pages->empty_end_pages;
	if (vacuumed_pages > 0)
	{
		/* get last reaped page from vacuum_pages */
		last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
		last_vacuum_block = last_vacuum_page->blkno;
	}
	else
	{
		last_vacuum_page = NULL;
		last_vacuum_block = InvalidBlockNumber;
	}

	vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
	vacpage->offsets_used = vacpage->offsets_free = 0;

	/*
	 * Scan pages backwards from the last nonempty page, trying to move tuples
	 * down to lower pages.  Quit when we reach a page that we have moved any
	 * tuples onto, or the first page if we haven't moved anything, or when we
	 * find a page we cannot completely empty (this last condition is handled
	 * by "break" statements within the loop).
	 *
	 * NB: this code depends on the vacuum_pages and fraged_pages lists being
	 * in order by blkno.
	 */
	nblocks = vacrelstats->rel_pages;
	for (blkno = nblocks - vacuum_pages->empty_end_pages - 1;
		 blkno > last_move_dest_block;
		 blkno--)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		bool		isempty,
					chain_tuple_moved;

		vacuum_delay_point();

		/*
B
Bruce Momjian 已提交
1714 1715 1716 1717
		 * Forget fraged_pages pages at or after this one; they're no longer
		 * useful as move targets, since we only want to move down. Note that
		 * since we stop the outer loop at last_move_dest_block, pages removed
		 * here cannot have had anything moved onto them already.
1718
		 *
1719 1720 1721
		 * Also note that we don't change the stored fraged_pages list, only
		 * our local variable num_fraged_pages; so the forgotten pages are
		 * still available to be loaded into the free space map later.
1722 1723
		 */
		while (num_fraged_pages > 0 &&
B
Bruce Momjian 已提交
1724
			   fraged_pages->pagedesc[num_fraged_pages - 1]->blkno >= blkno)
1725
		{
1726
			Assert(fraged_pages->pagedesc[num_fraged_pages - 1]->offsets_used == 0);
1727 1728 1729
			--num_fraged_pages;
		}

1730 1731 1732
		/*
		 * Process this page of relation.
		 */
1733 1734 1735
		buf = ReadBuffer(onerel, blkno);
		page = BufferGetPage(buf);

B
Bruce Momjian 已提交
1736
		vacpage->offsets_free = 0;
1737 1738 1739

		isempty = PageIsEmpty(page);

1740 1741
		/* Is the page in the vacuum_pages list? */
		if (blkno == last_vacuum_block)
1742
		{
1743 1744 1745
			if (last_vacuum_page->offsets_free > 0)
			{
				/* there are dead tuples on this page - clean them */
1746
				Assert(!isempty);
1747 1748 1749
				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
				vacuum_page(onerel, buf, last_vacuum_page);
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1750 1751 1752
			}
			else
				Assert(isempty);
B
Bruce Momjian 已提交
1753
			--vacuumed_pages;
1754 1755
			if (vacuumed_pages > 0)
			{
B
Bruce Momjian 已提交
1756
				/* get prev reaped page from vacuum_pages */
B
Bruce Momjian 已提交
1757 1758
				last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
				last_vacuum_block = last_vacuum_page->blkno;
1759 1760
			}
			else
1761
			{
1762
				last_vacuum_page = NULL;
1763
				last_vacuum_block = InvalidBlockNumber;
1764
			}
1765 1766 1767 1768 1769 1770 1771 1772 1773
			if (isempty)
			{
				ReleaseBuffer(buf);
				continue;
			}
		}
		else
			Assert(!isempty);

B
Bruce Momjian 已提交
1774 1775
		chain_tuple_moved = false;		/* no one chain-tuple was moved off
										 * this page, yet */
B
Bruce Momjian 已提交
1776
		vacpage->blkno = blkno;
1777 1778 1779 1780 1781
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
B
Bruce Momjian 已提交
1782 1783 1784
			Size		tuple_len;
			HeapTupleData tuple;
			ItemId		itemid = PageGetItemId(page, offnum);
1785 1786 1787 1788

			if (!ItemIdIsUsed(itemid))
				continue;

1789 1790 1791
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple_len = tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);
1792

B
Bruce Momjian 已提交
			/* ---
			 * VACUUM FULL has an exclusive lock on the relation.  So
			 * normally no other transaction can have pending INSERTs or
			 * DELETEs in this relation.  A tuple is either:
			 *		(a) a tuple in a system catalog, inserted or deleted
			 *			by a not yet committed transaction
			 *		(b) known dead (XMIN_INVALID, or XMAX_COMMITTED and xmax
			 *			is visible to all active transactions)
			 *		(c) inserted by a committed xact (XMIN_COMMITTED)
			 *		(d) moved by the currently running VACUUM.
			 *		(e) deleted (XMAX_COMMITTED) but at least one active
			 *			transaction does not see the deleting transaction
			 * In case (a) we wouldn't be in repair_frag() at all.
			 * In case (b) we cannot be here, because scan_heap() has
			 * already marked the item as unused, see continue above. Case
			 * (c) is what normally is to be expected. Case (d) is only
			 * possible, if a whole tuple chain has been moved while
			 * processing this or a higher numbered block.
			 * ---
			 */
1813 1814
			if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
			{
1815 1816 1817 1818 1819
				if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
					elog(ERROR, "HEAP_MOVED_IN was not expected");
				if (!(tuple.t_data->t_infomask & HEAP_MOVED_OFF))
					elog(ERROR, "HEAP_MOVED_OFF was expected");

B
Bruce Momjian 已提交
1820
				/*
				 * MOVED_OFF by another VACUUM would have caused the
				 * visibility check to set XMIN_COMMITTED or XMIN_INVALID.
				 */
1824 1825
				if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
					elog(ERROR, "invalid XVAC in tuple header");
B
Bruce Momjian 已提交
1826 1827

				/*
B
Bruce Momjian 已提交
				 * If this (chain) tuple is moved by me already then I have to
				 * check is it in vacpage or not - i.e. is it moved while
				 * cleaning this page or some previous one.
				 */
B
Bruce Momjian 已提交
1832 1833 1834 1835

				/* Can't we Assert(keep_tuples > 0) here? */
				if (keep_tuples == 0)
					continue;
1836 1837 1838
				if (chain_tuple_moved)
				{
					/* some chains were moved while cleaning this page */
B
Bruce Momjian 已提交
1839 1840 1841 1842 1843
					Assert(vacpage->offsets_free > 0);
					for (i = 0; i < vacpage->offsets_free; i++)
					{
						if (vacpage->offsets[i] == offnum)
							break;
1844
					}
B
Bruce Momjian 已提交
1845
					if (i >= vacpage->offsets_free)		/* not found */
1846
					{
B
Bruce Momjian 已提交
1847
						vacpage->offsets[vacpage->offsets_free++] = offnum;
1848 1849 1850
						keep_tuples--;
					}
				}
B
Bruce Momjian 已提交
1851 1852 1853 1854 1855 1856
				else
				{
					vacpage->offsets[vacpage->offsets_free++] = offnum;
					keep_tuples--;
				}
				continue;
1857 1858 1859
			}

			/*
B
Bruce Momjian 已提交
			 * If this tuple is in a chain of tuples created in updates by
			 * "recent" transactions then we have to move the whole chain of
			 * tuples to other places, so that we can write new t_ctid links
			 * that preserve the chain relationship.
			 *
			 * This test is complicated.  Read it as "if tuple is a recently
			 * created updated version, OR if it is an obsoleted version". (In
			 * the second half of the test, we needn't make any check on XMAX
			 * --- it must be recently obsoleted, else scan_heap would have
			 * deemed it removable.)
			 *
			 * NOTE: this test is not 100% accurate: it is possible for a
			 * tuple to be an updated one with recent xmin, and yet not match
			 * any new_tid entry in the vtlinks list.  Presumably there was
			 * once a parent tuple with xmax matching the xmin, but it's
			 * possible that that tuple has been removed --- for example, if
			 * it had xmin = xmax and wasn't itself an updated version, then
			 * HeapTupleSatisfiesVacuum would deem it removable as soon as the
			 * xmin xact completes.
			 *
			 * To be on the safe side, we abandon the repair_frag process if
			 * we cannot find the parent tuple in vtlinks.	This may be overly
			 * conservative; AFAICS it would be safe to move the chain.
			 *
			 * Also, because we distinguish DEAD and RECENTLY_DEAD tuples
			 * using OldestXmin, which is a rather coarse test, it is quite
			 * possible to have an update chain in which a tuple we think is
			 * RECENTLY_DEAD links forward to one that is definitely DEAD.
			 * In such a case the RECENTLY_DEAD tuple must actually be dead,
			 * but it seems too complicated to try to make VACUUM remove it.
			 * We treat each contiguous set of RECENTLY_DEAD tuples as a
			 * separately movable chain, ignoring any intervening DEAD ones.
1892
			 */
1893
			if (((tuple.t_data->t_infomask & HEAP_UPDATED) &&
B
Bruce Momjian 已提交
1894 1895
				 !TransactionIdPrecedes(HeapTupleHeaderGetXmin(tuple.t_data),
										OldestXmin)) ||
1896
				(!(tuple.t_data->t_infomask & (HEAP_XMAX_INVALID |
1897
											   HEAP_IS_LOCKED)) &&
1898 1899
				 !(ItemPointerEquals(&(tuple.t_self),
									 &(tuple.t_data->t_ctid)))))
1900
			{
B
Bruce Momjian 已提交
1901
				Buffer		Cbuf = buf;
1902 1903
				bool		freeCbuf = false;
				bool		chain_move_failed = false;
1904
				bool		moved_target = false;
B
Bruce Momjian 已提交
1905 1906 1907
				ItemPointerData Ctid;
				HeapTupleData tp = tuple;
				Size		tlen = tuple_len;
1908 1909 1910
				VTupleMove	vtmove;
				int			num_vtmove;
				int			free_vtmove;
B
Bruce Momjian 已提交
1911
				VacPage		to_vacpage = NULL;
B
Bruce Momjian 已提交
1912 1913
				int			to_item = 0;
				int			ti;
1914

B
Bruce Momjian 已提交
1915
				if (dst_buffer != InvalidBuffer)
1916
				{
1917
					ReleaseBuffer(dst_buffer);
B
Bruce Momjian 已提交
1918
					dst_buffer = InvalidBuffer;
1919
				}
B
Bruce Momjian 已提交
1920

1921 1922 1923
				/* Quick exit if we have no vtlinks to search in */
				if (vacrelstats->vtlinks == NULL)
				{
1924
					elog(DEBUG2, "parent item in update-chain not found --- cannot continue repair_frag");
B
Bruce Momjian 已提交
1925
					break;		/* out of walk-along-page loop */
1926 1927
				}

1928
				/*
B
Bruce Momjian 已提交
				 * If this tuple is in the begin/middle of the chain then we
				 * have to move to the end of chain.  As with any t_ctid
				 * chase, we have to verify that each new tuple is really the
				 * descendant of the tuple we came from; however, here we
				 * need even more than the normal amount of paranoia.
				 * If t_ctid links forward to a tuple determined to be DEAD,
				 * then depending on where that tuple is, it might already
				 * have been removed, and perhaps even replaced by a MOVED_IN
				 * tuple.  We don't want to include any DEAD tuples in the
				 * chain, so we have to recheck HeapTupleSatisfiesVacuum.
				 */
1940
				while (!(tp.t_data->t_infomask & (HEAP_XMAX_INVALID |
1941
												  HEAP_IS_LOCKED)) &&
1942 1943
					   !(ItemPointerEquals(&(tp.t_self),
										   &(tp.t_data->t_ctid))))
1944
				{
1945 1946 1947 1948 1949 1950 1951
					ItemPointerData nextTid;
					TransactionId priorXmax;
					Buffer		nextBuf;
					Page		nextPage;
					OffsetNumber nextOffnum;
					ItemId		nextItemid;
					HeapTupleHeader nextTdata;
1952
					HTSV_Result	nextTstatus;
1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963

					nextTid = tp.t_data->t_ctid;
					priorXmax = HeapTupleHeaderGetXmax(tp.t_data);
					/* assume block# is OK (see heap_fetch comments) */
					nextBuf = ReadBuffer(onerel,
										 ItemPointerGetBlockNumber(&nextTid));
					nextPage = BufferGetPage(nextBuf);
					/* If bogus or unused slot, assume tp is end of chain */
					nextOffnum = ItemPointerGetOffsetNumber(&nextTid);
					if (nextOffnum < FirstOffsetNumber ||
						nextOffnum > PageGetMaxOffsetNumber(nextPage))
1964
					{
1965 1966 1967 1968 1969 1970 1971 1972
						ReleaseBuffer(nextBuf);
						break;
					}
					nextItemid = PageGetItemId(nextPage, nextOffnum);
					if (!ItemIdIsUsed(nextItemid))
					{
						ReleaseBuffer(nextBuf);
						break;
1973
					}
1974 1975 1976 1977 1978 1979 1980 1981 1982
					/* if not matching XMIN, assume tp is end of chain */
					nextTdata = (HeapTupleHeader) PageGetItem(nextPage,
															  nextItemid);
					if (!TransactionIdEquals(HeapTupleHeaderGetXmin(nextTdata),
											 priorXmax))
					{
						ReleaseBuffer(nextBuf);
						break;
					}
1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995
					/* must check for DEAD or MOVED_IN tuple, too */
					nextTstatus = HeapTupleSatisfiesVacuum(nextTdata,
														   OldestXmin,
														   nextBuf);
					if (nextTstatus == HEAPTUPLE_DEAD ||
						nextTstatus == HEAPTUPLE_INSERT_IN_PROGRESS)
					{
						ReleaseBuffer(nextBuf);
						break;
					}
					/* if it's MOVED_OFF we shoulda moved this one with it */
					if (nextTstatus == HEAPTUPLE_DELETE_IN_PROGRESS)
						elog(ERROR, "updated tuple is already HEAP_MOVED_OFF");
1996 1997 1998 1999
					/* OK, switch our attention to the next tuple in chain */
					tp.t_data = nextTdata;
					tp.t_self = nextTid;
					tlen = tp.t_len = ItemIdGetLength(nextItemid);
2000 2001
					if (freeCbuf)
						ReleaseBuffer(Cbuf);
2002 2003
					Cbuf = nextBuf;
					freeCbuf = true;
2004 2005
				}

2006 2007 2008 2009 2010
				/* Set up workspace for planning the chain move */
				vtmove = (VTupleMove) palloc(100 * sizeof(VTupleMoveData));
				num_vtmove = 0;
				free_vtmove = 100;

2011
				/*
B
Bruce Momjian 已提交
				 * Now, walk backwards up the chain (towards older tuples) and
				 * check if all items in chain can be moved.  We record all
				 * the moves that need to be made in the vtmove array.
				 */
B
Bruce Momjian 已提交
2016
				for (;;)
2017
				{
2018 2019 2020
					Buffer		Pbuf;
					Page		Ppage;
					ItemId		Pitemid;
2021
					HeapTupleHeader PTdata;
2022 2023 2024
					VTupleLinkData vtld,
							   *vtlp;

2025
					/* Identify a target page to move this tuple to */
B
Bruce Momjian 已提交
2026 2027
					if (to_vacpage == NULL ||
						!enough_space(to_vacpage, tlen))
2028 2029 2030
					{
						for (i = 0; i < num_fraged_pages; i++)
						{
B
Bruce Momjian 已提交
2031
							if (enough_space(fraged_pages->pagedesc[i], tlen))
2032 2033
								break;
						}
B
Bruce Momjian 已提交
2034 2035

						if (i == num_fraged_pages)
B
Bruce Momjian 已提交
2036
						{
2037
							/* can't move item anywhere */
2038
							chain_move_failed = true;
B
Bruce Momjian 已提交
2039
							break;		/* out of check-all-items loop */
2040 2041
						}
						to_item = i;
B
Bruce Momjian 已提交
2042
						to_vacpage = fraged_pages->pagedesc[to_item];
2043
					}
B
Bruce Momjian 已提交
2044 2045
					to_vacpage->free -= MAXALIGN(tlen);
					if (to_vacpage->offsets_used >= to_vacpage->offsets_free)
2046
						to_vacpage->free -= sizeof(ItemIdData);
B
Bruce Momjian 已提交
2047
					(to_vacpage->offsets_used)++;
2048 2049

					/* Add an entry to vtmove list */
2050 2051 2052
					if (free_vtmove == 0)
					{
						free_vtmove = 1000;
2053 2054 2055 2056
						vtmove = (VTupleMove)
							repalloc(vtmove,
									 (free_vtmove + num_vtmove) *
									 sizeof(VTupleMoveData));
2057 2058
					}
					vtmove[num_vtmove].tid = tp.t_self;
B
Bruce Momjian 已提交
2059 2060
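					/* cleanVpd flags the first tuple headed for this target
					 * page, so move_chain_tuple() will vacuum_page() it
					 * before storing anything there. */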
					vtmove[num_vtmove].vacpage = to_vacpage;
					if (to_vacpage->offsets_used == 1)
2061 2062 2063 2064 2065
						vtmove[num_vtmove].cleanVpd = true;
					else
						vtmove[num_vtmove].cleanVpd = false;
					free_vtmove--;
					num_vtmove++;
B
Bruce Momjian 已提交
2066

2067 2068 2069 2070 2071
					/* Remember if we reached the original target tuple */
					if (ItemPointerGetBlockNumber(&tp.t_self) == blkno &&
						ItemPointerGetOffsetNumber(&tp.t_self) == offnum)
						moved_target = true;

2072
					/* Done if at beginning of chain */
B
Bruce Momjian 已提交
2073
					if (!(tp.t_data->t_infomask & HEAP_UPDATED) ||
B
Bruce Momjian 已提交
2074 2075 2076
					 TransactionIdPrecedes(HeapTupleHeaderGetXmin(tp.t_data),
										   OldestXmin))
						break;	/* out of check-all-items loop */
B
Bruce Momjian 已提交
2077

2078
					/* Move to tuple with prior row version */
2079 2080 2081 2082 2083 2084 2085 2086
					vtld.new_tid = tp.t_self;
					vtlp = (VTupleLink)
						vac_bsearch((void *) &vtld,
									(void *) (vacrelstats->vtlinks),
									vacrelstats->num_vtlinks,
									sizeof(VTupleLinkData),
									vac_cmp_vtlinks);
					if (vtlp == NULL)
2087
					{
2088
						/* see discussion above */
2089
						elog(DEBUG2, "parent item in update-chain not found --- cannot continue repair_frag");
2090
						chain_move_failed = true;
B
Bruce Momjian 已提交
2091
						break;	/* out of check-all-items loop */
2092 2093 2094
					}
					tp.t_self = vtlp->this_tid;
					Pbuf = ReadBuffer(onerel,
B
Bruce Momjian 已提交
2095
									ItemPointerGetBlockNumber(&(tp.t_self)));
2096 2097
					Ppage = BufferGetPage(Pbuf);
					Pitemid = PageGetItemId(Ppage,
B
Bruce Momjian 已提交
2098
								   ItemPointerGetOffsetNumber(&(tp.t_self)));
2099 2100
					/* this can't happen since we saw tuple earlier: */
					if (!ItemIdIsUsed(Pitemid))
2101
						elog(ERROR, "parent itemid marked as unused");
2102
					PTdata = (HeapTupleHeader) PageGetItem(Ppage, Pitemid);
2103 2104 2105

					/* ctid should not have changed since we saved it */
					Assert(ItemPointerEquals(&(vtld.new_tid),
2106
											 &(PTdata->t_ctid)));
B
Bruce Momjian 已提交
2107

2108
					/*
					 * Read above about cases when !ItemIdIsUsed(nextItemid)
					 * (child item is removed)... Due to the fact that at the
					 * moment we don't remove unuseful part of update-chain,
					 * it's possible to get non-matching parent row here. Like
					 * as in the case which caused this problem, we stop
					 * shrinking here. I could try to find real parent row but
					 * want not to do it because of real solution will be
					 * implemented anyway, later, and we are too close to 6.5
					 * release. - vadim 06/11/99
					 */
2119 2120
					if ((PTdata->t_infomask & HEAP_XMAX_IS_MULTI) ||
						!(TransactionIdEquals(HeapTupleHeaderGetXmax(PTdata),
B
Bruce Momjian 已提交
2121
										 HeapTupleHeaderGetXmin(tp.t_data))))
2122 2123
					{
						ReleaseBuffer(Pbuf);
2124
						elog(DEBUG2, "too old parent tuple found --- cannot continue repair_frag");
2125
						chain_move_failed = true;
B
Bruce Momjian 已提交
2126
						break;	/* out of check-all-items loop */
2127
					}
2128
					tp.t_data = PTdata;
2129 2130 2131 2132 2133
					tlen = tp.t_len = ItemIdGetLength(Pitemid);
					if (freeCbuf)
						ReleaseBuffer(Cbuf);
					Cbuf = Pbuf;
					freeCbuf = true;
B
Bruce Momjian 已提交
2134
				}				/* end of check-all-items loop */
2135

2136 2137
				if (freeCbuf)
					ReleaseBuffer(Cbuf);
2138 2139
				freeCbuf = false;

2140 2141 2142 2143 2144 2145 2146
				/* Double-check that we will move the current target tuple */
				if (!moved_target && !chain_move_failed)
				{
					elog(DEBUG2, "failed to chain back to target --- cannot continue repair_frag");
					chain_move_failed = true;
				}

2147
				if (chain_move_failed)
2148
				{
2149
					/*
B
Bruce Momjian 已提交
2150 2151 2152
					 * Undo changes to offsets_used state.	We don't bother
					 * cleaning up the amount-free state, since we're not
					 * going to do any further tuple motion.
2153 2154 2155 2156 2157 2158
					 */
					for (i = 0; i < num_vtmove; i++)
					{
						Assert(vtmove[i].vacpage->offsets_used > 0);
						(vtmove[i].vacpage->offsets_used)--;
					}
2159
					pfree(vtmove);
2160
					break;		/* out of walk-along-page loop */
2161
				}
2162

2163
				/*
2164 2165 2166
				 * Okay, move the whole tuple chain in reverse order.
				 *
				 * Ctid tracks the new location of the previously-moved tuple.
2167
				 */
2168 2169 2170
				ItemPointerSetInvalid(&Ctid);
				for (ti = 0; ti < num_vtmove; ti++)
				{
B
Bruce Momjian 已提交
2171
					VacPage		destvacpage = vtmove[ti].vacpage;
B
Bruce Momjian 已提交
2172 2173
					Page		Cpage;
					ItemId		Citemid;
2174

V
Vadim B. Mikheev 已提交
2175
					/* Get page to move from */
2176
					tuple.t_self = vtmove[ti].tid;
B
Bruce Momjian 已提交
2177
					Cbuf = ReadBuffer(onerel,
B
Bruce Momjian 已提交
2178
								 ItemPointerGetBlockNumber(&(tuple.t_self)));
V
Vadim B. Mikheev 已提交
2179 2180

					/* Get page to move to */
B
Bruce Momjian 已提交
2181
					dst_buffer = ReadBuffer(onerel, destvacpage->blkno);
V
Vadim B. Mikheev 已提交
2182

B
Bruce Momjian 已提交
2183 2184
					LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE);
					if (dst_buffer != Cbuf)
V
Vadim B. Mikheev 已提交
2185 2186
						LockBuffer(Cbuf, BUFFER_LOCK_EXCLUSIVE);

B
Bruce Momjian 已提交
2187
					dst_page = BufferGetPage(dst_buffer);
2188
					Cpage = BufferGetPage(Cbuf);
V
Vadim B. Mikheev 已提交
2189

B
Bruce Momjian 已提交
2190
					Citemid = PageGetItemId(Cpage,
B
Bruce Momjian 已提交
2191
								ItemPointerGetOffsetNumber(&(tuple.t_self)));
2192 2193
					tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
					tuple_len = tuple.t_len = ItemIdGetLength(Citemid);
2194

B
Bruce Momjian 已提交
2195 2196 2197
					move_chain_tuple(onerel, Cbuf, Cpage, &tuple,
									 dst_buffer, dst_page, destvacpage,
									 &ec, &Ctid, vtmove[ti].cleanVpd);
V
Vadim B. Mikheev 已提交
2198

B
Bruce Momjian 已提交
2199
					num_moved++;
2200
					if (destvacpage->blkno > last_move_dest_block)
B
Bruce Momjian 已提交
2201
						last_move_dest_block = destvacpage->blkno;
B
Bruce Momjian 已提交
2202

2203 2204 2205 2206
					/*
					 * Remember that we moved tuple from the current page
					 * (corresponding index tuple will be cleaned).
					 */
2207
					if (Cbuf == buf)
B
Bruce Momjian 已提交
2208
						vacpage->offsets[vacpage->offsets_free++] =
B
Bruce Momjian 已提交
2209
							ItemPointerGetOffsetNumber(&(tuple.t_self));
2210 2211
					else
						keep_tuples++;
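					/* The chain member was moved off some page other than the
					 * one being scanned; it stays counted in keep_tuples until
					 * that page is visited, or the count is handed to
					 * vacuum_index(). */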
2212

2213 2214
					ReleaseBuffer(dst_buffer);
					ReleaseBuffer(Cbuf);
B
Bruce Momjian 已提交
2215
				}				/* end of move-the-tuple-chain loop */
2216

B
Bruce Momjian 已提交
2217
				dst_buffer = InvalidBuffer;
2218
				pfree(vtmove);
2219
				chain_tuple_moved = true;
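				/* offsets recorded in vacpage are no longer guaranteed to be
				 * in order; they are sorted below before index cleanup */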
2220 2221

				/* advance to next tuple in walk-along-page loop */
2222
				continue;
B
Bruce Momjian 已提交
2223
			}					/* end of is-tuple-in-chain test */
2224

2225
			/* try to find new page for this tuple */
B
Bruce Momjian 已提交
2226 2227
			if (dst_buffer == InvalidBuffer ||
				!enough_space(dst_vacpage, tuple_len))
2228
			{
B
Bruce Momjian 已提交
2229
				if (dst_buffer != InvalidBuffer)
2230
				{
2231
					ReleaseBuffer(dst_buffer);
B
Bruce Momjian 已提交
2232
					dst_buffer = InvalidBuffer;
2233
				}
B
Bruce Momjian 已提交
2234
				for (i = 0; i < num_fraged_pages; i++)
2235
				{
B
Bruce Momjian 已提交
2236
					if (enough_space(fraged_pages->pagedesc[i], tuple_len))
2237 2238
						break;
				}
B
Bruce Momjian 已提交
2239
				if (i == num_fraged_pages)
2240
					break;		/* can't move item anywhere */
B
Bruce Momjian 已提交
2241 2242 2243 2244
				dst_vacpage = fraged_pages->pagedesc[i];
				dst_buffer = ReadBuffer(onerel, dst_vacpage->blkno);
				LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE);
				dst_page = BufferGetPage(dst_buffer);
2245
				/* if this page was not used before - clean it */
B
Bruce Momjian 已提交
2246 2247
				if (!PageIsEmpty(dst_page) && dst_vacpage->offsets_used == 0)
					vacuum_page(onerel, dst_buffer, dst_vacpage);
2248
			}
V
Vadim B. Mikheev 已提交
2249
			else
B
Bruce Momjian 已提交
2250
				LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE);
V
Vadim B. Mikheev 已提交
2251 2252

			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2253

B
Bruce Momjian 已提交
2254 2255
			move_plain_tuple(onerel, buf, page, &tuple,
							 dst_buffer, dst_page, dst_vacpage, &ec);
2256

B
Bruce Momjian 已提交
2257 2258 2259
			num_moved++;
			if (dst_vacpage->blkno > last_move_dest_block)
				last_move_dest_block = dst_vacpage->blkno;
2260

B
Bruce Momjian 已提交
2261
			/*
B
Bruce Momjian 已提交
2262 2263
			 * Remember that we moved tuple from the current page
			 * (corresponding index tuple will be cleaned).
2264
			 */
B
Bruce Momjian 已提交
2265
			vacpage->offsets[vacpage->offsets_free++] = offnum;
B
Bruce Momjian 已提交
2266
		}						/* walk along page */
2267

		/*
		 * If we broke out of the walk-along-page loop early (ie, still have
		 * offnum <= maxoff), then we failed to move some tuple off this page.
		 * No point in shrinking any more, so clean up and exit the per-page
		 * loop.
		 */
2274 2275
		if (offnum < maxoff && keep_tuples > 0)
		{
B
Bruce Momjian 已提交
2276
			OffsetNumber off;
2277

2278
			/*
B
Bruce Momjian 已提交
2279
			 * Fix vacpage state for any unvisited tuples remaining on page
2280
			 */
2281
			for (off = OffsetNumberNext(offnum);
B
Bruce Momjian 已提交
2282 2283
				 off <= maxoff;
				 off = OffsetNumberNext(off))
2284
			{
B
Bruce Momjian 已提交
2285 2286
				ItemId		itemid = PageGetItemId(page, off);
				HeapTupleHeader htup;
B
Bruce Momjian 已提交
2287

2288 2289
				if (!ItemIdIsUsed(itemid))
					continue;
B
Bruce Momjian 已提交
2290 2291
				htup = (HeapTupleHeader) PageGetItem(page, itemid);
				if (htup->t_infomask & HEAP_XMIN_COMMITTED)
2292
					continue;
B
Bruce Momjian 已提交
2293

B
Bruce Momjian 已提交
				/*
				 * See comments in the walk-along-page loop above about why
				 * only MOVED_OFF tuples should be found here.
				 */
				if (htup->t_infomask & HEAP_MOVED_IN)
					elog(ERROR, "HEAP_MOVED_IN was not expected");
				if (!(htup->t_infomask & HEAP_MOVED_OFF))
					elog(ERROR, "HEAP_MOVED_OFF was expected");
				if (HeapTupleHeaderGetXvac(htup) != myXID)
					elog(ERROR, "invalid XVAC in tuple header");

B
Bruce Momjian 已提交
2305
				if (chain_tuple_moved)
2306
				{
2307
					/* some chains were moved while cleaning this page */
B
Bruce Momjian 已提交
2308 2309 2310 2311 2312
					Assert(vacpage->offsets_free > 0);
					for (i = 0; i < vacpage->offsets_free; i++)
					{
						if (vacpage->offsets[i] == off)
							break;
2313
					}
B
Bruce Momjian 已提交
2314
					if (i >= vacpage->offsets_free)		/* not found */
2315
					{
B
Bruce Momjian 已提交
2316
						vacpage->offsets[vacpage->offsets_free++] = off;
2317 2318 2319 2320
						Assert(keep_tuples > 0);
						keep_tuples--;
					}
				}
2321
				else
B
Bruce Momjian 已提交
2322 2323 2324 2325 2326
				{
					vacpage->offsets[vacpage->offsets_free++] = off;
					Assert(keep_tuples > 0);
					keep_tuples--;
				}
2327 2328 2329
			}
		}

B
Bruce Momjian 已提交
2330
		if (vacpage->offsets_free > 0)	/* some tuples were moved */
V
Vadim B. Mikheev 已提交
2331
		{
2332 2333
			if (chain_tuple_moved)		/* else - they are ordered */
			{
B
Bruce Momjian 已提交
2334
				qsort((char *) (vacpage->offsets), vacpage->offsets_free,
B
Bruce Momjian 已提交
2335
					  sizeof(OffsetNumber), vac_cmp_offno);
2336
			}
2337
			vpage_insert(&Nvacpagelist, copy_vac_page(vacpage));
V
Vadim B. Mikheev 已提交
2338
		}
2339 2340

		ReleaseBuffer(buf);
V
Vadim B. Mikheev 已提交
2341

2342
		if (offnum <= maxoff)
2343
			break;				/* had to quit early, see above note */
2344 2345 2346 2347 2348

	}							/* walk along relation */

	blkno++;					/* new number of blocks */

B
Bruce Momjian 已提交
2349
	if (dst_buffer != InvalidBuffer)
V
Vadim B. Mikheev 已提交
2350
	{
B
Bruce Momjian 已提交
2351
		Assert(num_moved > 0);
2352
		ReleaseBuffer(dst_buffer);
V
Vadim B. Mikheev 已提交
2353
	}
2354

B
Bruce Momjian 已提交
2355
	if (num_moved > 0)
V
Vadim B. Mikheev 已提交
2356
	{
2357
		/*
2358 2359 2360
		 * We have to commit our tuple movings before we truncate the
		 * relation.  Ideally we should do Commit/StartTransactionCommand
		 * here, relying on the session-level table lock to protect our
B
Bruce Momjian 已提交
2361 2362 2363 2364
		 * exclusive access to the relation.  However, that would require a
		 * lot of extra code to close and re-open the relation, indexes, etc.
		 * For now, a quick hack: record status of current transaction as
		 * committed, and continue.
2365
		 */
V
Vadim B. Mikheev 已提交
2366
		RecordTransactionCommit();
V
Vadim B. Mikheev 已提交
2367
	}
2368 2369

	/*
2370
	 * We are not going to move any more tuples across pages, but we still
B
Bruce Momjian 已提交
2371 2372 2373 2374
	 * need to apply vacuum_page to compact free space in the remaining pages
	 * in vacuum_pages list.  Note that some of these pages may also be in the
	 * fraged_pages list, and may have had tuples moved onto them; if so, we
	 * already did vacuum_page and needn't do it again.
2375
	 */
2376 2377 2378
	for (i = 0, curpage = vacuum_pages->pagedesc;
		 i < vacuumed_pages;
		 i++, curpage++)
V
Vadim B. Mikheev 已提交
2379
	{
2380 2381
		vacuum_delay_point();

2382
		Assert((*curpage)->blkno < blkno);
2383
		if ((*curpage)->offsets_used == 0)
2384
		{
B
Bruce Momjian 已提交
2385 2386 2387
			Buffer		buf;
			Page		page;

2388 2389 2390 2391
			/* this page was not used as a move target, so must clean it */
			buf = ReadBuffer(onerel, (*curpage)->blkno);
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
			page = BufferGetPage(buf);
2392
			if (!PageIsEmpty(page))
2393
				vacuum_page(onerel, buf, *curpage);
2394
			UnlockReleaseBuffer(buf);
2395
		}
2396 2397 2398
	}

	/*
2399 2400 2401
	 * Now scan all the pages that we moved tuples onto and update tuple
	 * status bits.  This is not really necessary, but will save time for
	 * future transactions examining these tuples.
2402
	 */
B
Bruce Momjian 已提交
2403 2404
	update_hint_bits(onerel, fraged_pages, num_fraged_pages,
					 last_move_dest_block, num_moved);
B
Bruce Momjian 已提交
2405

2406
	/*
B
Bruce Momjian 已提交
2407 2408 2409 2410
	 * It'd be cleaner to make this report at the bottom of this routine, but
	 * then the rusage would double-count the second pass of index vacuuming.
	 * So do it here and ignore the relatively small amount of processing that
	 * occurs below.
2411 2412
	 */
	ereport(elevel,
B
Bruce Momjian 已提交
2413 2414 2415 2416 2417
			(errmsg("\"%s\": moved %u row versions, truncated %u to %u pages",
					RelationGetRelationName(onerel),
					num_moved, nblocks, blkno),
			 errdetail("%s.",
					   pg_rusage_show(&ru0))));
2418

2419 2420 2421
	/*
	 * Reflect the motion of system tuples to catalog cache here.
	 */
2422
	CommandCounterIncrement();
2423

B
Bruce Momjian 已提交
2424
	if (Nvacpagelist.num_pages > 0)
V
Vadim B. Mikheev 已提交
2425
	{
2426
		/* vacuum indexes again if needed */
2427
		if (Irel != NULL)
2428
		{
B
Bruce Momjian 已提交
2429
			VacPage    *vpleft,
2430 2431
					   *vpright,
						vpsave;
2432

B
Bruce Momjian 已提交
2433 2434
			/* re-sort Nvacpagelist.pagedesc */
			for (vpleft = Nvacpagelist.pagedesc,
B
Bruce Momjian 已提交
2435
				 vpright = Nvacpagelist.pagedesc + Nvacpagelist.num_pages - 1;
2436 2437 2438 2439 2440 2441
				 vpleft < vpright; vpleft++, vpright--)
			{
				vpsave = *vpleft;
				*vpleft = *vpright;
				*vpright = vpsave;
			}
B
Bruce Momjian 已提交
2442

B
Bruce Momjian 已提交
2443
			/*
B
Bruce Momjian 已提交
2444 2445 2446 2447
			 * keep_tuples is the number of tuples that have been moved off a
			 * page during chain moves but not been scanned over subsequently.
			 * The tuple ids of these tuples are not recorded as free offsets
			 * for any VacPage, so they will not be cleared from the indexes.
B
Bruce Momjian 已提交
2448
			 */
2449
			Assert(keep_tuples >= 0);
2450
			for (i = 0; i < nindexes; i++)
B
Bruce Momjian 已提交
2451
				vacuum_index(&Nvacpagelist, Irel[i],
2452
							 vacrelstats->rel_tuples, keep_tuples);
2453 2454
		}

2455 2456 2457
		/*
		 * Clean moved-off tuples from last page in Nvacpagelist list.
		 *
		 * We need only do this in this one page, because higher-numbered
		 * pages are going to be truncated from the relation entirely. But see
		 * comments for update_hint_bits().
		 */
2462
		if (vacpage->blkno == (blkno - 1) &&
B
Bruce Momjian 已提交
2463
			vacpage->offsets_free > 0)
2464
		{
B
Bruce Momjian 已提交
2465 2466
			Buffer		buf;
			Page		page;
2467
			OffsetNumber unused[MaxOffsetNumber];
B
Bruce Momjian 已提交
2468 2469 2470 2471
			OffsetNumber offnum,
						maxoff;
			int			uncnt;
			int			num_tuples = 0;
2472

B
Bruce Momjian 已提交
2473
			buf = ReadBuffer(onerel, vacpage->blkno);
2474
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2475
			page = BufferGetPage(buf);
2476
			maxoff = PageGetMaxOffsetNumber(page);
2477
			for (offnum = FirstOffsetNumber;
2478
				 offnum <= maxoff;
2479 2480
				 offnum = OffsetNumberNext(offnum))
			{
B
Bruce Momjian 已提交
2481
				ItemId		itemid = PageGetItemId(page, offnum);
B
Bruce Momjian 已提交
2482 2483
				HeapTupleHeader htup;

2484 2485
				if (!ItemIdIsUsed(itemid))
					continue;
B
Bruce Momjian 已提交
2486 2487 2488
				htup = (HeapTupleHeader) PageGetItem(page, itemid);
				if (htup->t_infomask & HEAP_XMIN_COMMITTED)
					continue;
2489

B
Bruce Momjian 已提交
				/*
				 * See comments in the walk-along-page loop above about why
				 * only MOVED_OFF tuples should be found here.
				 */
				if (htup->t_infomask & HEAP_MOVED_IN)
					elog(ERROR, "HEAP_MOVED_IN was not expected");
				if (!(htup->t_infomask & HEAP_MOVED_OFF))
					elog(ERROR, "HEAP_MOVED_OFF was expected");
				if (HeapTupleHeaderGetXvac(htup) != myXID)
					elog(ERROR, "invalid XVAC in tuple header");
B
Bruce Momjian 已提交
2500 2501 2502

				itemid->lp_flags &= ~LP_USED;
				num_tuples++;
2503
			}
B
Bruce Momjian 已提交
2504
			Assert(vacpage->offsets_free == num_tuples);
2505

2506
			START_CRIT_SECTION();
2507

2508
			uncnt = PageRepairFragmentation(page, unused);
2509

2510 2511
			MarkBufferDirty(buf);

2512 2513
			/* XLOG stuff */
			if (!onerel->rd_istemp)
2514
			{
2515
				XLogRecPtr	recptr;
B
Bruce Momjian 已提交
2516

2517
				recptr = log_heap_clean(onerel, buf, unused, uncnt);
2518
				PageSetLSN(page, recptr);
2519
				PageSetTLI(page, ThisTimeLineID);
2520
			}
2521 2522
			else
			{
B
Bruce Momjian 已提交
2523
				/*
B
Bruce Momjian 已提交
2524 2525
				 * No XLOG record, but still need to flag that XID exists on
				 * disk
B
Bruce Momjian 已提交
2526
				 */
2527 2528 2529
				MyXactMadeTempRelUpdate = true;
			}

2530
			END_CRIT_SECTION();
2531

2532
			UnlockReleaseBuffer(buf);
2533 2534
		}

B
Bruce Momjian 已提交
2535
		/* now - free new list of reaped pages */
B
Bruce Momjian 已提交
2536 2537 2538 2539
		curpage = Nvacpagelist.pagedesc;
		for (i = 0; i < Nvacpagelist.num_pages; i++, curpage++)
			pfree(*curpage);
		pfree(Nvacpagelist.pagedesc);
V
Vadim B. Mikheev 已提交
2540 2541
	}

2542
	/* Truncate relation, if needed */
2543
	if (blkno < nblocks)
V
Vadim B. Mikheev 已提交
2544
	{
2545
		RelationTruncate(onerel, blkno);
2546
		vacrelstats->rel_pages = blkno; /* set new number of blocks */
2547 2548
	}

2549
	/* clean up */
B
Bruce Momjian 已提交
2550
	pfree(vacpage);
2551 2552
	if (vacrelstats->vtlinks != NULL)
		pfree(vacrelstats->vtlinks);
2553

B
Bruce Momjian 已提交
2554 2555
	ExecContext_Finish(&ec);
}
2556

B
Bruce Momjian 已提交
2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576
/*
 *	move_chain_tuple() -- move one tuple that is part of a tuple chain
 *
 *		This routine moves old_tup from old_page to dst_page.
 *		old_page and dst_page might be the same page.
 *		On entry old_buf and dst_buf are locked exclusively, both locks (or
 *		the single lock, if this is a intra-page-move) are released before
 *		exit.
 *
 *		Yes, a routine with ten parameters is ugly, but it's still better
 *		than having these 120 lines of code in repair_frag() which is
 *		already too long and almost unreadable.
 */
static void
move_chain_tuple(Relation rel,
				 Buffer old_buf, Page old_page, HeapTuple old_tup,
				 Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
				 ExecContext ec, ItemPointer ctid, bool cleanVpd)
{
	TransactionId myXID = GetCurrentTransactionId();
B
Bruce Momjian 已提交
2577 2578 2579 2580
	HeapTupleData newtup;
	OffsetNumber newoff;
	ItemId		newitemid;
	Size		tuple_len = old_tup->t_len;
B
Bruce Momjian 已提交
2581

2582 2583 2584
	/*
	 * make a modifiable copy of the source tuple.
	 */
B
Bruce Momjian 已提交
2585 2586
	heap_copytuple_with_tuple(old_tup, &newtup);

2587 2588 2589
	/*
	 * register invalidation of source tuple in catcaches.
	 */
B
Bruce Momjian 已提交
2590 2591 2592 2593 2594
	CacheInvalidateHeapTuple(rel, old_tup);

	/* NO EREPORT(ERROR) TILL CHANGES ARE LOGGED */
	START_CRIT_SECTION();

2595 2596 2597
	/*
	 * mark the source tuple MOVED_OFF.
	 */
B
Bruce Momjian 已提交
2598
	old_tup->t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
B
Bruce Momjian 已提交
2599 2600
									 HEAP_XMIN_INVALID |
									 HEAP_MOVED_IN);
B
Bruce Momjian 已提交
2601 2602 2603 2604 2605 2606
	old_tup->t_data->t_infomask |= HEAP_MOVED_OFF;
	HeapTupleHeaderSetXvac(old_tup->t_data, myXID);

	/*
	 * If this page was not used before - clean it.
	 *
B
Bruce Momjian 已提交
2607 2608 2609 2610 2611 2612 2613
	 * NOTE: a nasty bug used to lurk here.  It is possible for the source and
	 * destination pages to be the same (since this tuple-chain member can be
	 * on a page lower than the one we're currently processing in the outer
	 * loop).  If that's true, then after vacuum_page() the source tuple will
	 * have been moved, and tuple.t_data will be pointing at garbage.
	 * Therefore we must do everything that uses old_tup->t_data BEFORE this
	 * step!!
B
Bruce Momjian 已提交
2614
	 *
2615 2616
	 * This path is different from the other callers of vacuum_page, because
	 * we have already incremented the vacpage's offsets_used field to account
B
Bruce Momjian 已提交
2617 2618 2619 2620
	 * for the tuple(s) we expect to move onto the page. Therefore
	 * vacuum_page's check for offsets_used == 0 is wrong. But since that's a
	 * good debugging check for all other callers, we work around it here
	 * rather than remove it.
B
Bruce Momjian 已提交
2621 2622 2623
	 */
	if (!PageIsEmpty(dst_page) && cleanVpd)
	{
B
Bruce Momjian 已提交
2624
		int			sv_offsets_used = dst_vacpage->offsets_used;
B
Bruce Momjian 已提交
2625 2626 2627 2628 2629 2630 2631

		dst_vacpage->offsets_used = 0;
		vacuum_page(rel, dst_buf, dst_vacpage);
		dst_vacpage->offsets_used = sv_offsets_used;
	}

	/*
B
Bruce Momjian 已提交
2632 2633
	 * Update the state of the copied tuple, and store it on the destination
	 * page.
B
Bruce Momjian 已提交
2634 2635 2636 2637 2638 2639 2640 2641 2642 2643
	 */
	newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
								   HEAP_XMIN_INVALID |
								   HEAP_MOVED_OFF);
	newtup.t_data->t_infomask |= HEAP_MOVED_IN;
	HeapTupleHeaderSetXvac(newtup.t_data, myXID);
	newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len,
						 InvalidOffsetNumber, LP_USED);
	if (newoff == InvalidOffsetNumber)
		elog(PANIC, "failed to add item with len = %lu to page %u while moving tuple chain",
B
Bruce Momjian 已提交
2644
			 (unsigned long) tuple_len, dst_vacpage->blkno);
B
Bruce Momjian 已提交
2645
	newitemid = PageGetItemId(dst_page, newoff);
2646
	/* drop temporary copy, and point to the version on the dest page */
B
Bruce Momjian 已提交
2647 2648
	pfree(newtup.t_data);
	newtup.t_data = (HeapTupleHeader) PageGetItem(dst_page, newitemid);
2649

B
Bruce Momjian 已提交
2650 2651
	ItemPointerSet(&(newtup.t_self), dst_vacpage->blkno, newoff);

2652
	/*
B
Bruce Momjian 已提交
2653 2654 2655
	 * Set new tuple's t_ctid pointing to itself if last tuple in chain, and
	 * to next tuple in chain otherwise.  (Since we move the chain in reverse
	 * order, this is actually the previously processed tuple.)
2656 2657 2658 2659 2660 2661 2662
	 */
	if (!ItemPointerIsValid(ctid))
		newtup.t_data->t_ctid = newtup.t_self;
	else
		newtup.t_data->t_ctid = *ctid;
	*ctid = newtup.t_self;

2663 2664 2665 2666
	MarkBufferDirty(dst_buf);
	if (dst_buf != old_buf)
		MarkBufferDirty(old_buf);

B
Bruce Momjian 已提交
2667 2668 2669 2670 2671 2672 2673 2674 2675
	/* XLOG stuff */
	if (!rel->rd_istemp)
	{
		XLogRecPtr	recptr = log_heap_move(rel, old_buf, old_tup->t_self,
										   dst_buf, &newtup);

		if (old_buf != dst_buf)
		{
			PageSetLSN(old_page, recptr);
2676
			PageSetTLI(old_page, ThisTimeLineID);
B
Bruce Momjian 已提交
2677 2678
		}
		PageSetLSN(dst_page, recptr);
2679
		PageSetTLI(dst_page, ThisTimeLineID);
B
Bruce Momjian 已提交
2680 2681 2682
	}
	else
	{
2683 2684 2685
		/*
		 * No XLOG record, but still need to flag that XID exists on disk
		 */
B
Bruce Momjian 已提交
2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699
		MyXactMadeTempRelUpdate = true;
	}

	END_CRIT_SECTION();

	LockBuffer(dst_buf, BUFFER_LOCK_UNLOCK);
	if (dst_buf != old_buf)
		LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);

	/* Create index entries for the moved tuple */
	if (ec->resultRelInfo->ri_NumIndices > 0)
	{
		ExecStoreTuple(&newtup, ec->slot, InvalidBuffer, false);
		ExecInsertIndexTuples(ec->slot, &(newtup.t_self), ec->estate, true);
2700
		ResetPerTupleExprContext(ec->estate);
B
Bruce Momjian 已提交
2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721
	}
}

/*
 *	move_plain_tuple() -- move one tuple that is not part of a chain
 *
 *		This routine moves old_tup from old_page to dst_page.
 *		On entry old_buf and dst_buf are locked exclusively, both locks are
 *		released before exit.
 *
 *		Yes, a routine with eight parameters is ugly, but it's still better
 *		than having these 90 lines of code in repair_frag() which is already
 *		too long and almost unreadable.
 */
static void
move_plain_tuple(Relation rel,
				 Buffer old_buf, Page old_page, HeapTuple old_tup,
				 Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
				 ExecContext ec)
{
	TransactionId myXID = GetCurrentTransactionId();
B
Bruce Momjian 已提交
2722 2723 2724 2725
	HeapTupleData newtup;
	OffsetNumber newoff;
	ItemId		newitemid;
	Size		tuple_len = old_tup->t_len;
B
Bruce Momjian 已提交
2726 2727 2728 2729 2730 2731 2732

	/* copy tuple */
	heap_copytuple_with_tuple(old_tup, &newtup);

	/*
	 * register invalidation of source tuple in catcaches.
	 *
B
Bruce Momjian 已提交
2733
	 * (Note: we do not need to register the copied tuple, because we are not
B
Bruce Momjian 已提交
2734 2735
	 * changing the tuple contents and so there cannot be any need to flush
	 * negative catcache entries.)
B
Bruce Momjian 已提交
2736 2737 2738 2739 2740 2741
	 */
	CacheInvalidateHeapTuple(rel, old_tup);

	/* NO EREPORT(ERROR) TILL CHANGES ARE LOGGED */
	START_CRIT_SECTION();

2742 2743 2744
	/*
	 * Mark new tuple as MOVED_IN by me.
	 */
B
Bruce Momjian 已提交
2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764
	newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
								   HEAP_XMIN_INVALID |
								   HEAP_MOVED_OFF);
	newtup.t_data->t_infomask |= HEAP_MOVED_IN;
	HeapTupleHeaderSetXvac(newtup.t_data, myXID);

	/* add tuple to the page */
	newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len,
						 InvalidOffsetNumber, LP_USED);
	if (newoff == InvalidOffsetNumber)
		elog(PANIC, "failed to add item with len = %lu to page %u (free space %lu, nusd %u, noff %u)",
			 (unsigned long) tuple_len,
			 dst_vacpage->blkno, (unsigned long) dst_vacpage->free,
			 dst_vacpage->offsets_used, dst_vacpage->offsets_free);
	newitemid = PageGetItemId(dst_page, newoff);
	pfree(newtup.t_data);
	newtup.t_data = (HeapTupleHeader) PageGetItem(dst_page, newitemid);
	ItemPointerSet(&(newtup.t_data->t_ctid), dst_vacpage->blkno, newoff);
	newtup.t_self = newtup.t_data->t_ctid;

2765 2766 2767
	/*
	 * Mark old tuple as MOVED_OFF by me.
	 */
B
Bruce Momjian 已提交
2768
	old_tup->t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
B
Bruce Momjian 已提交
2769 2770
									 HEAP_XMIN_INVALID |
									 HEAP_MOVED_IN);
B
Bruce Momjian 已提交
2771 2772 2773
	old_tup->t_data->t_infomask |= HEAP_MOVED_OFF;
	HeapTupleHeaderSetXvac(old_tup->t_data, myXID);

2774 2775 2776
	MarkBufferDirty(dst_buf);
	MarkBufferDirty(old_buf);

B
Bruce Momjian 已提交
2777 2778 2779 2780 2781 2782 2783
	/* XLOG stuff */
	if (!rel->rd_istemp)
	{
		XLogRecPtr	recptr = log_heap_move(rel, old_buf, old_tup->t_self,
										   dst_buf, &newtup);

		PageSetLSN(old_page, recptr);
2784
		PageSetTLI(old_page, ThisTimeLineID);
B
Bruce Momjian 已提交
2785
		PageSetLSN(dst_page, recptr);
2786
		PageSetTLI(dst_page, ThisTimeLineID);
B
Bruce Momjian 已提交
2787 2788 2789
	}
	else
	{
2790 2791 2792
		/*
		 * No XLOG record, but still need to flag that XID exists on disk
		 */
B
Bruce Momjian 已提交
2793 2794 2795 2796 2797
		MyXactMadeTempRelUpdate = true;
	}

	END_CRIT_SECTION();

B
Bruce Momjian 已提交
2798
	dst_vacpage->free = PageGetFreeSpaceWithFillFactor(rel, dst_page);
B
Bruce Momjian 已提交
2799 2800 2801 2802 2803 2804 2805 2806 2807 2808
	LockBuffer(dst_buf, BUFFER_LOCK_UNLOCK);
	LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);

	dst_vacpage->offsets_used++;

	/* insert index' tuples if needed */
	if (ec->resultRelInfo->ri_NumIndices > 0)
	{
		ExecStoreTuple(&newtup, ec->slot, InvalidBuffer, false);
		ExecInsertIndexTuples(ec->slot, &(newtup.t_self), ec->estate, true);
2809
		ResetPerTupleExprContext(ec->estate);
B
Bruce Momjian 已提交
2810 2811 2812 2813 2814 2815
	}
}

/*
 *	update_hint_bits() -- update hint bits in destination pages
 *
 * Scan all the pages that we moved tuples onto and update tuple status bits.
 * This is not really necessary, but it will save time for future transactions
 * examining these tuples.
 *
 * This pass guarantees that all HEAP_MOVED_IN tuples are marked as
 * XMIN_COMMITTED, so that future tqual tests won't need to check their XVAC.
 *
 * BUT NOTICE that this code fails to clear HEAP_MOVED_OFF tuples from
 * pages that were move source pages but not move dest pages.  The bulk
 * of the move source pages will be physically truncated from the relation,
 * and the last page remaining in the rel will be fixed separately in
 * repair_frag(), so the only cases where a MOVED_OFF tuple won't get its
 * hint bits updated are tuples that are moved as part of a chain and were
 * on pages that were not either move destinations nor at the end of the rel.
 * To completely ensure that no MOVED_OFF tuples remain unmarked, we'd have
 * to remember and revisit those pages too.
 *
 * One wonders whether it wouldn't be better to skip this work entirely,
 * and let the tuple status updates happen someplace that's not holding an
 * exclusive lock on the relation.
 */
static void
update_hint_bits(Relation rel, VacPageList fraged_pages, int num_fraged_pages,
				 BlockNumber last_move_dest_block, int num_moved)
{
	TransactionId myXID = GetCurrentTransactionId();
	int			checked_moved = 0;
	int			i;
	VacPage    *curpage;

	for (i = 0, curpage = fraged_pages->pagedesc;
		 i < num_fraged_pages;
		 i++, curpage++)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber max_offset;
		OffsetNumber off;
		int			num_tuples = 0;

		vacuum_delay_point();

		if ((*curpage)->blkno > last_move_dest_block)
			break;				/* no need to scan any further */
		if ((*curpage)->offsets_used == 0)
			continue;			/* this page was never used as a move dest */
		buf = ReadBuffer(rel, (*curpage)->blkno);
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		page = BufferGetPage(buf);
		max_offset = PageGetMaxOffsetNumber(page);
		for (off = FirstOffsetNumber;
			 off <= max_offset;
			 off = OffsetNumberNext(off))
		{
			ItemId		itemid = PageGetItemId(page, off);
			HeapTupleHeader htup;

			if (!ItemIdIsUsed(itemid))
				continue;
			htup = (HeapTupleHeader) PageGetItem(page, itemid);
			if (htup->t_infomask & HEAP_XMIN_COMMITTED)
				continue;

			/*
			 * Here we may see either MOVED_OFF or MOVED_IN tuples.
			 */
			if (!(htup->t_infomask & HEAP_MOVED))
				elog(ERROR, "HEAP_MOVED_OFF/HEAP_MOVED_IN was expected");
			if (HeapTupleHeaderGetXvac(htup) != myXID)
				elog(ERROR, "invalid XVAC in tuple header");

			if (htup->t_infomask & HEAP_MOVED_IN)
			{
				htup->t_infomask |= HEAP_XMIN_COMMITTED;
				htup->t_infomask &= ~HEAP_MOVED;
				num_tuples++;
			}
			else
				htup->t_infomask |= HEAP_XMIN_INVALID;
		}
		MarkBufferDirty(buf);
		UnlockReleaseBuffer(buf);
		Assert((*curpage)->offsets_used == num_tuples);
		checked_moved += num_tuples;
	}
	Assert(num_moved == checked_moved);
}
V
Vadim B. Mikheev 已提交

/*
 *	vacuum_heap() -- free dead tuples
 *
 *		This routine marks dead tuples as unused and truncates relation
 *		if there are "empty" end-blocks.
 */
static void
vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
{
	Buffer		buf;
	VacPage    *vacpage;
	BlockNumber relblocks;
	int			nblocks;
	int			i;

	nblocks = vacuum_pages->num_pages;
	nblocks -= vacuum_pages->empty_end_pages;	/* nothing to do with them */

	for (i = 0, vacpage = vacuum_pages->pagedesc; i < nblocks; i++, vacpage++)
	{
		vacuum_delay_point();

		if ((*vacpage)->offsets_free > 0)
		{
			buf = ReadBuffer(onerel, (*vacpage)->blkno);
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
			vacuum_page(onerel, buf, *vacpage);
			UnlockReleaseBuffer(buf);
		}
	}

	/* Truncate relation if there are some empty end-pages */
	Assert(vacrelstats->rel_pages >= vacuum_pages->empty_end_pages);
	if (vacuum_pages->empty_end_pages > 0)
	{
		relblocks = vacrelstats->rel_pages - vacuum_pages->empty_end_pages;
		ereport(elevel,
				(errmsg("\"%s\": truncated %u to %u pages",
						RelationGetRelationName(onerel),
						vacrelstats->rel_pages, relblocks)));
		RelationTruncate(onerel, relblocks);
		vacrelstats->rel_pages = relblocks;		/* set new number of blocks */
	}
}
V
Vadim B. Mikheev 已提交

/*
 *	vacuum_page() -- free dead tuples on a page
 *					 and repair its fragmentation.
 *
 * Caller must hold pin and lock on buffer.
 */
static void
vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
{
	OffsetNumber unused[MaxOffsetNumber];
	int			uncnt;
	Page		page = BufferGetPage(buffer);
	ItemId		itemid;
	int			i;

	/* There shouldn't be any tuples moved onto the page yet! */
	Assert(vacpage->offsets_used == 0);

	START_CRIT_SECTION();

	for (i = 0; i < vacpage->offsets_free; i++)
	{
		itemid = PageGetItemId(page, vacpage->offsets[i]);
		itemid->lp_flags &= ~LP_USED;
	}

	uncnt = PageRepairFragmentation(page, unused);

	MarkBufferDirty(buffer);

	/* XLOG stuff */
	if (!onerel->rd_istemp)
	{
		XLogRecPtr	recptr;

		recptr = log_heap_clean(onerel, buffer, unused, uncnt);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
	}
	else
	{
		/* No XLOG record, but still need to flag that XID exists on disk */
		MyXactMadeTempRelUpdate = true;
	}

	END_CRIT_SECTION();
}

/*
 *	scan_index() -- scan one index relation to update pg_class statistics.
 *
 * We use this when we have no deletions to do.
 */
static void
scan_index(Relation indrel, double num_tuples)
{
	IndexBulkDeleteResult *stats;
	IndexVacuumInfo ivinfo;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	ivinfo.index = indrel;
	ivinfo.vacuum_full = true;
	ivinfo.message_level = elevel;
	ivinfo.num_heap_tuples = num_tuples;

	stats = index_vacuum_cleanup(&ivinfo, NULL);

	if (!stats)
		return;

	/* now update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(indrel),
						stats->num_pages, stats->num_index_tuples,
						false, InvalidTransactionId);

	ereport(elevel,
			(errmsg("index \"%s\" now contains %.0f row versions in %u pages",
					RelationGetRelationName(indrel),
					stats->num_index_tuples,
					stats->num_pages),
	errdetail("%u index pages have been deleted, %u are currently reusable.\n"
			  "%s.",
			  stats->pages_deleted, stats->pages_free,
			  pg_rusage_show(&ru0))));

	/*
	 * Check for tuple count mismatch.	If the index is partial, then it's OK
	 * for it to have fewer tuples than the heap; else we got trouble.
	 */
	if (stats->num_index_tuples != num_tuples)
	{
		if (stats->num_index_tuples > num_tuples ||
			!vac_is_partial_index(indrel))
			ereport(WARNING,
					(errmsg("index \"%s\" contains %.0f row versions, but table contains %.0f row versions",
							RelationGetRelationName(indrel),
							stats->num_index_tuples, num_tuples),
					 errhint("Rebuild the index with REINDEX.")));
	}

	pfree(stats);
}

/*
 *	vacuum_index() -- vacuum one index relation.
 *
 *		vacpagelist is the VacPageList of the heap we're currently vacuuming;
 *		the heap itself is already exclusive-locked.  Indrel is an index
 *		relation on the vacuumed heap.
 *
 *		We don't bother to set locks on the index relation here, since
 *		the parent table is exclusive-locked already.
 *
 *		Finally, we arrange to update the index relation's statistics in
 *		pg_class.
 */
static void
vacuum_index(VacPageList vacpagelist, Relation indrel,
			 double num_tuples, int keep_tuples)
{
	IndexBulkDeleteResult *stats;
	IndexVacuumInfo ivinfo;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	ivinfo.index = indrel;
	ivinfo.vacuum_full = true;
	ivinfo.message_level = elevel;
	ivinfo.num_heap_tuples = num_tuples + keep_tuples;

	/* Do bulk deletion */
	stats = index_bulk_delete(&ivinfo, NULL, tid_reaped, (void *) vacpagelist);

	/* Do post-VACUUM cleanup */
	stats = index_vacuum_cleanup(&ivinfo, stats);

	if (!stats)
		return;

	/* now update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(indrel),
						stats->num_pages, stats->num_index_tuples,
						false, InvalidTransactionId);

	ereport(elevel,
			(errmsg("index \"%s\" now contains %.0f row versions in %u pages",
					RelationGetRelationName(indrel),
					stats->num_index_tuples,
					stats->num_pages),
			 errdetail("%.0f index row versions were removed.\n"
					   "%u index pages have been deleted, %u are currently reusable.\n"
					   "%s.",
					   stats->tuples_removed,
					   stats->pages_deleted, stats->pages_free,
					   pg_rusage_show(&ru0))));

	/*
	 * Check for tuple count mismatch.  If the index is partial, then it's OK
	 * for it to have fewer tuples than the heap; else we got trouble.
	 */
	if (stats->num_index_tuples != num_tuples + keep_tuples)
	{
		if (stats->num_index_tuples > num_tuples + keep_tuples ||
			!vac_is_partial_index(indrel))
			ereport(WARNING,
					(errmsg("index \"%s\" contains %.0f row versions, but table contains %.0f row versions",
							RelationGetRelationName(indrel),
							stats->num_index_tuples, num_tuples + keep_tuples),
					 errhint("Rebuild the index with REINDEX.")));
	}

	pfree(stats);
}

/*
 *	tid_reaped() -- is a particular tid reaped?
 *
 *		This has the right signature to be an IndexBulkDeleteCallback.
 *
 *		vacpagelist->pagedesc is sorted in blkno order, and each page's
 *		offsets array is sorted in offset order, so both lookups below can
 *		use vac_bsearch().
 */
static bool
tid_reaped(ItemPointer itemptr, void *state)
{
	VacPageList vacpagelist = (VacPageList) state;
	OffsetNumber ioffno;
	OffsetNumber *voff;
	VacPage		vp,
			   *vpp;
	VacPageData vacpage;

	vacpage.blkno = ItemPointerGetBlockNumber(itemptr);
	ioffno = ItemPointerGetOffsetNumber(itemptr);

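	/*
	 * Two-level binary search: first find the VacPage entry for this tuple's
	 * block in the sorted page array, then look for the tuple's offset in
	 * that page's sorted offsets array.
	 */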
	vp = &vacpage;
	vpp = (VacPage *) vac_bsearch((void *) &vp,
								  (void *) (vacpagelist->pagedesc),
								  vacpagelist->num_pages,
								  sizeof(VacPage),
								  vac_cmp_blk);

	if (vpp == NULL)
		return false;

	/* ok - we are on a partially or fully reaped page */
	vp = *vpp;

	if (vp->offsets_free == 0)
	{
		/* this is EmptyPage, so claim all tuples on it are reaped!!! */
		return true;
	}

	voff = (OffsetNumber *) vac_bsearch((void *) &ioffno,
										(void *) (vp->offsets),
										vp->offsets_free,
										sizeof(OffsetNumber),
										vac_cmp_offno);

	if (voff == NULL)
		return false;

	/* tid is reaped */
	return true;
}

/*
 * Update the shared Free Space Map with the info we now have about
 * free space in the relation, discarding any old info the map may have.
 */
static void
vac_update_fsm(Relation onerel, VacPageList fraged_pages,
			   BlockNumber rel_pages)
{
	int			nPages = fraged_pages->num_pages;
	VacPage    *pagedesc = fraged_pages->pagedesc;
	Size		threshold;
	PageFreeSpaceInfo *pageSpaces;
	int			outPages;
	int			i;

	/*
	 * We only report pages with free space at least equal to the average
	 * request size --- this avoids cluttering FSM with uselessly-small bits
	 * of space.  Although FSM would discard pages with little free space
	 * anyway, it's important to do this prefiltering because (a) it reduces
	 * the time spent holding the FSM lock in RecordRelationFreeSpace, and (b)
	 * FSM uses the number of pages reported as a statistic for guiding space
	 * management.  If we didn't threshold our reports the same way
	 * vacuumlazy.c does, we'd be skewing that statistic.
	 */
	threshold = GetAvgFSMRequestSize(&onerel->rd_node);

	pageSpaces = (PageFreeSpaceInfo *)
		palloc(nPages * sizeof(PageFreeSpaceInfo));
	outPages = 0;

	for (i = 0; i < nPages; i++)
	{
		/*
		 * fraged_pages may contain entries for pages that we later decided to
		 * truncate from the relation; don't enter them into the free space
		 * map!  Since pagedesc is in blkno order, we can stop at the first
		 * such page.
		 */
		if (pagedesc[i]->blkno >= rel_pages)
			break;

		if (pagedesc[i]->free >= threshold)
		{
			pageSpaces[outPages].blkno = pagedesc[i]->blkno;
			pageSpaces[outPages].avail = pagedesc[i]->free;
			outPages++;
		}
	}

	RecordRelationFreeSpace(&onerel->rd_node, outPages, outPages, pageSpaces);

	pfree(pageSpaces);
}

/* Copy a VacPage structure */
static VacPage
copy_vac_page(VacPage vacpage)
{
	VacPage		newvacpage;

	/* allocate a VacPageData entry */
	newvacpage = (VacPage) palloc(sizeof(VacPageData) +
							   vacpage->offsets_free * sizeof(OffsetNumber));

	/* fill it in */
	if (vacpage->offsets_free > 0)
		memcpy(newvacpage->offsets, vacpage->offsets,
			   vacpage->offsets_free * sizeof(OffsetNumber));
	newvacpage->blkno = vacpage->blkno;
	newvacpage->free = vacpage->free;
	newvacpage->offsets_used = vacpage->offsets_used;
	newvacpage->offsets_free = vacpage->offsets_free;

	return newvacpage;
}

/*
 * Add a VacPage pointer to a VacPageList.
 *
 *		As a side effect of the way that scan_heap works,
 *		higher pages come after lower pages in the array
 *		(and highest tid on a page is last).
 */
static void
vpage_insert(VacPageList vacpagelist, VacPage vpnew)
{
#define PG_NPAGEDESC 1024

	/* allocate a VacPage entry if needed */
	if (vacpagelist->num_pages == 0)
	{
		vacpagelist->pagedesc = (VacPage *) palloc(PG_NPAGEDESC * sizeof(VacPage));
		vacpagelist->num_allocated_pages = PG_NPAGEDESC;
	}
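	/* pagedesc is full: double its size before appending */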
	else if (vacpagelist->num_pages >= vacpagelist->num_allocated_pages)
	{
		vacpagelist->num_allocated_pages *= 2;
		vacpagelist->pagedesc = (VacPage *) repalloc(vacpagelist->pagedesc, vacpagelist->num_allocated_pages * sizeof(VacPage));
	}
	vacpagelist->pagedesc[vacpagelist->num_pages] = vpnew;
	(vacpagelist->num_pages)++;
}

/*
 * vac_bsearch: just like standard C library routine bsearch(),
 * except that we first test to see whether the target key is outside
 * the range of the table entries.  This case is handled relatively slowly
 * by the normal binary search algorithm (ie, no faster than any other key)
 * but it occurs often enough in VACUUM to be worth optimizing.
 */
static void *
vac_bsearch(const void *key, const void *base,
			size_t nelem, size_t size,
			int (*compar) (const void *, const void *))
{
	int			res;
	const void *last;

	if (nelem == 0)
		return NULL;
	res = compar(key, base);
	if (res < 0)
		return NULL;
	if (res == 0)
		return (void *) base;
	if (nelem > 1)
	{
		last = (const void *) ((const char *) base + (nelem - 1) * size);
		res = compar(key, last);
		if (res > 0)
			return NULL;
		if (res == 0)
			return (void *) last;
	}
	if (nelem <= 2)
		return NULL;			/* already checked 'em all */
	return bsearch(key, base, nelem, size, compar);
}

/*
 * Comparator routines for use with qsort() and bsearch().
 */
static int
vac_cmp_blk(const void *left, const void *right)
{
	BlockNumber lblk,
				rblk;

	lblk = (*((VacPage *) left))->blkno;
	rblk = (*((VacPage *) right))->blkno;

	if (lblk < rblk)
		return -1;
	if (lblk == rblk)
		return 0;
	return 1;
}

static int
vac_cmp_offno(const void *left, const void *right)
{
	if (*(OffsetNumber *) left < *(OffsetNumber *) right)
		return -1;
	if (*(OffsetNumber *) left == *(OffsetNumber *) right)
		return 0;
	return 1;
}

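/*
 * Order VTupleLink entries by their new_tid: block number first (bi_hi,
 * then bi_lo), then offset number.  This lets the vtlinks array be sorted,
 * and later searched, by new_tid.
 */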
static int
vac_cmp_vtlinks(const void *left, const void *right)
{
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi <
		((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
		return -1;
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi >
		((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
		return 1;
	/* bi_hi-es are equal */
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo <
		((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
		return -1;
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo >
		((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
		return 1;
	/* bi_lo-es are equal */
	if (((VTupleLink) left)->new_tid.ip_posid <
		((VTupleLink) right)->new_tid.ip_posid)
		return -1;
	if (((VTupleLink) left)->new_tid.ip_posid >
		((VTupleLink) right)->new_tid.ip_posid)
		return 1;
	return 0;
}


/*
 * Open all the indexes of the given relation, obtaining the specified kind
 * of lock on each.  Return an array of Relation pointers for the indexes
 * into *Irel, and the number of indexes into *nindexes.
 */
void
vac_open_indexes(Relation relation, LOCKMODE lockmode,
				 int *nindexes, Relation **Irel)
{
	List	   *indexoidlist;
	ListCell   *indexoidscan;
	int			i;

	Assert(lockmode != NoLock);

	indexoidlist = RelationGetIndexList(relation);

	*nindexes = list_length(indexoidlist);

	if (*nindexes > 0)
		*Irel = (Relation *) palloc(*nindexes * sizeof(Relation));
	else
		*Irel = NULL;

	i = 0;
	foreach(indexoidscan, indexoidlist)
	{
		Oid			indexoid = lfirst_oid(indexoidscan);

		(*Irel)[i++] = index_open(indexoid, lockmode);
	}

	list_free(indexoidlist);
}

/*
 * Release the resources acquired by vac_open_indexes.  Optionally release
 * the locks (say NoLock to keep 'em).
 */
void
vac_close_indexes(int nindexes, Relation *Irel, LOCKMODE lockmode)
{
	if (Irel == NULL)
		return;

	while (nindexes--)
	{
		Relation	ind = Irel[nindexes];

		index_close(ind, lockmode);
	}
	pfree(Irel);
}


/*
 * Is an index partial (ie, could it contain fewer tuples than the heap?)
 */
bool
vac_is_partial_index(Relation indrel)
{
	/*
	 * If the index's AM doesn't support nulls, it's partial for our purposes
	 */
	if (!indrel->rd_am->amindexnulls)
		return true;

	/* Otherwise, look to see if there's a partial-index predicate */
	if (!heap_attisnull(indrel->rd_indextuple, Anum_pg_index_indpred))
		return true;

	return false;
}


static bool
enough_space(VacPage vacpage, Size len)
{
	len = MAXALIGN(len);

	if (len > vacpage->free)
		return false;

	/* if there are free itemid(s) and len <= free_space... */
	if (vacpage->offsets_used < vacpage->offsets_free)
		return true;

	/* noff_used >= noff_free and so we'll have to allocate new itemid */
	if (len + sizeof(ItemIdData) <= vacpage->free)
		return true;

	return false;
}

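/*
 * Report a page's free space in excess of the relation's fillfactor
 * reservation; pages already at or below their fillfactor target report
 * zero.
 */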
static Size
PageGetFreeSpaceWithFillFactor(Relation relation, Page page)
{
	PageHeader	pd = (PageHeader) page;
	Size		freespace = pd->pd_upper - pd->pd_lower;
	Size		targetfree;

	targetfree = RelationGetTargetPageFreeSpace(relation,
												HEAP_DEFAULT_FILLFACTOR);
	if (freespace > targetfree)
		return freespace - targetfree;
	else
		return 0;
}

/*
 * vacuum_delay_point --- check for interrupts and cost-based delay.
 *
 * This should be called in each major loop of VACUUM processing,
 * typically once per page processed.
 */
void
vacuum_delay_point(void)
{
	/* Always check for interrupts */
	CHECK_FOR_INTERRUPTS();

	/* Nap if appropriate */
	if (VacuumCostActive && !InterruptPending &&
		VacuumCostBalance >= VacuumCostLimit)
	{
		int			msec;

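		/*
		 * Sleep for a time proportional to the accumulated cost, capped at
		 * four times the base delay, then start a new cost balance.
		 */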
		msec = VacuumCostDelay * VacuumCostBalance / VacuumCostLimit;
		if (msec > VacuumCostDelay * 4)
			msec = VacuumCostDelay * 4;

		pg_usleep(msec * 1000L);

		VacuumCostBalance = 0;

		/* Might have gotten an interrupt while sleeping */
		CHECK_FOR_INTERRUPTS();
	}
}