/*-------------------------------------------------------------------------
 *
 * vacuum.c
 *	  The postgres vacuum cleaner.
 *
 * This file includes the "full" version of VACUUM, as well as control code
 * used by all three of full VACUUM, lazy VACUUM, and ANALYZE.	See
 * vacuumlazy.c and analyze.c for the rest of the code for the latter two.
 *
 *
 * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.235 2002/08/30 22:18:05 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <unistd.h>

#include "access/clog.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "catalog/catname.h"
#include "catalog/namespace.h"
#include "catalog/pg_database.h"
#include "catalog/pg_index.h"
#include "commands/vacuum.h"
#include "executor/executor.h"
#include "miscadmin.h"
#include "storage/freespace.h"
#include "storage/sinval.h"
#include "storage/smgr.h"
#include "tcop/pquery.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/relcache.h"
#include "utils/syscache.h"
#include "pgstat.h"


typedef struct VacPageData
{
	BlockNumber blkno;			/* BlockNumber of this Page */
	Size		free;			/* FreeSpace on this Page */
	uint16		offsets_used;	/* Number of OffNums used by vacuum */
	uint16		offsets_free;	/* Number of OffNums free or to be free */
	OffsetNumber offsets[1];	/* Array of free OffNums */
} VacPageData;

typedef VacPageData *VacPage;

typedef struct VacPageListData
{
	BlockNumber empty_end_pages;	/* Number of "empty" end-pages */
	int			num_pages;		/* Number of pages in pagedesc */
	int			num_allocated_pages;	/* Number of allocated pages in
										 * pagedesc */
	VacPage    *pagedesc;		/* Descriptions of pages */
} VacPageListData;

typedef VacPageListData *VacPageList;

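/*
 * VTupleLink records one step of a tuple update chain: this_tid is an
 * updated tuple's own TID and new_tid is its successor version (the
 * tuple's t_ctid).  scan_heap() collects these links so that repair_frag()
 * can recognize and move whole update chains.
 */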
typedef struct VTupleLinkData
{
	ItemPointerData new_tid;
	ItemPointerData this_tid;
} VTupleLinkData;

typedef VTupleLinkData *VTupleLink;

typedef struct VTupleMoveData
{
	ItemPointerData tid;		/* tuple ID */
	VacPage		vacpage;		/* where to move */
	bool		cleanVpd;		/* clean vacpage before using */
} VTupleMoveData;

typedef VTupleMoveData *VTupleMove;

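/*
 * VRelStats holds what scan_heap() learned about the relation: its size,
 * live-tuple count, the smallest and largest surviving tuple lengths
 * (used when judging which pages have usefully re-usable free space),
 * whether it has indexes, and the update-chain links collected for
 * repair_frag().
 */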
typedef struct VRelStats
{
	BlockNumber rel_pages;
	double		rel_tuples;
	Size		min_tlen;
	Size		max_tlen;
	bool		hasindex;
	int			num_vtlinks;
	VTupleLink	vtlinks;
} VRelStats;


static MemoryContext vac_context = NULL;

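/* message level for vacuum progress reports: INFO when VERBOSE, else DEBUG1 */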
static int elevel = -1;

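/*
 * Cutoff XIDs for the current vacuum: tuples deleted before OldestXmin
 * are dead to every open transaction and can be removed, while committed
 * xmins that precede FreezeLimit are replaced with FrozenTransactionId.
 */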
static TransactionId OldestXmin;
static TransactionId FreezeLimit;

static TransactionId initialOldestXmin;
static TransactionId initialFreezeLimit;


/* non-export function prototypes */
static List *getrels(const RangeVar *vacrel, const char *stmttype);
static void vac_update_dbstats(Oid dbid,
				   TransactionId vacuumXID,
				   TransactionId frozenXID);
static void vac_truncate_clog(TransactionId vacuumXID,
				  TransactionId frozenXID);
static void vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind);
static void full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt);
static void scan_heap(VRelStats *vacrelstats, Relation onerel,
		  VacPageList vacuum_pages, VacPageList fraged_pages);
static void repair_frag(VRelStats *vacrelstats, Relation onerel,
			VacPageList vacuum_pages, VacPageList fraged_pages,
			int nindexes, Relation *Irel);
static void vacuum_heap(VRelStats *vacrelstats, Relation onerel,
			VacPageList vacpagelist);
static void vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage);
static void vacuum_index(VacPageList vacpagelist, Relation indrel,
			 double num_tuples, int keep_tuples);
static void scan_index(Relation indrel, double num_tuples);
static bool tid_reaped(ItemPointer itemptr, void *state);
static bool dummy_tid_reaped(ItemPointer itemptr, void *state);
static void vac_update_fsm(Relation onerel, VacPageList fraged_pages,
			   BlockNumber rel_pages);
static VacPage copy_vac_page(VacPage vacpage);
static void vpage_insert(VacPageList vacpagelist, VacPage vpnew);
static void *vac_bsearch(const void *key, const void *base,
			size_t nelem, size_t size,
			int (*compar) (const void *, const void *));
static int	vac_cmp_blk(const void *left, const void *right);
static int	vac_cmp_offno(const void *left, const void *right);
static int	vac_cmp_vtlinks(const void *left, const void *right);
static bool enough_space(VacPage vacpage, Size len);


/****************************************************************************
 *																			*
 *			Code common to all flavors of VACUUM and ANALYZE				*
 *																			*
 ****************************************************************************
 */


/*
 * Primary entry point for VACUUM and ANALYZE commands.
 */
void
vacuum(VacuumStmt *vacstmt)
{
	const char *stmttype = vacstmt->vacuum ? "VACUUM" : "ANALYZE";
	MemoryContext anl_context = NULL;
	List	   *vrl,
			   *cur;

	if (vacstmt->verbose)
		elevel = INFO;
	else
		elevel = DEBUG1;

	/*
	 * We cannot run VACUUM inside a user transaction block; if we were
	 * inside a transaction, then our commit- and
	 * start-transaction-command calls would not have the intended effect!
	 * Furthermore, the forced commit that occurs before truncating the
	 * relation's file would have the effect of committing the rest of the
	 * user's transaction too, which would certainly not be the desired
	 * behavior.
	 */
	if (vacstmt->vacuum && IsTransactionBlock())
		elog(ERROR, "%s cannot run inside a BEGIN/END block", stmttype);

	/* Running VACUUM from a function would free the function context */
	if (vacstmt->vacuum && !MemoryContextContains(QueryContext, vacstmt))
		elog(ERROR, "%s cannot be executed from a function", stmttype);

	/*
	 * Send info about dead objects to the statistics collector
	 */
	if (vacstmt->vacuum)
		pgstat_vacuum_tabstat();

	/*
	 * Create special memory context for cross-transaction storage.
	 *
	 * Since it is a child of QueryContext, it will go away eventually even
	 * if we suffer an error; there's no need for special abort cleanup
	 * logic.
	 */
	vac_context = AllocSetContextCreate(QueryContext,
										"Vacuum",
										ALLOCSET_DEFAULT_MINSIZE,
										ALLOCSET_DEFAULT_INITSIZE,
										ALLOCSET_DEFAULT_MAXSIZE);

	/*
	 * If we are running only ANALYZE, we don't need per-table transactions,
	 * but we still need a memory context with table lifetime.
	 */
	if (vacstmt->analyze && !vacstmt->vacuum)
		anl_context = AllocSetContextCreate(QueryContext,
											"Analyze",
											ALLOCSET_DEFAULT_MINSIZE,
											ALLOCSET_DEFAULT_INITSIZE,
											ALLOCSET_DEFAULT_MAXSIZE);

	/* Build list of relations to process (note this lives in vac_context) */
	vrl = getrels(vacstmt->relation, stmttype);

	/*
	 * Formerly, there was code here to prevent more than one VACUUM from
	 * executing concurrently in the same database.  However, there's no
	 * good reason to prevent that, and manually removing lockfiles after
	 * a vacuum crash was a pain for dbadmins.  So, forget about lockfiles,
	 * and just rely on the locks we grab on each target table
	 * to ensure that there aren't two VACUUMs running on the same table
	 * at the same time.
	 */

	/*
	 * The strangeness with committing and starting transactions here is due
	 * to wanting to run each table's VACUUM as a separate transaction, so
	 * that we don't hold locks unnecessarily long.  Also, if we are doing
	 * VACUUM ANALYZE, the ANALYZE part runs as a separate transaction from
	 * the VACUUM to further reduce locking.
	 *
	 * vacuum_rel expects to be entered with no transaction active; it will
	 * start and commit its own transaction.  But we are called by an SQL
	 * command, and so we are executing inside a transaction already.  We
	 * commit the transaction started in PostgresMain() here, and start
	 * another one before exiting to match the commit waiting for us back in
	 * PostgresMain().
	 *
	 * In the case of an ANALYZE statement (no vacuum, just analyze) it's
	 * okay to run the whole thing in the outer transaction, and so we skip
	 * transaction start/stop operations.
	 */
	if (vacstmt->vacuum)
	{
		if (vacstmt->relation == NULL)
		{
			/*
			 * It's a database-wide VACUUM.
			 *
			 * Compute the initially applicable OldestXmin and FreezeLimit
			 * XIDs, so that we can record these values at the end of the
			 * VACUUM. Note that individual tables may well be processed with
			 * newer values, but we can guarantee that no (non-shared)
			 * relations are processed with older ones.
			 *
			 * It is okay to record non-shared values in pg_database, even though
			 * we may vacuum shared relations with older cutoffs, because only
			 * the minimum of the values present in pg_database matters.  We
			 * can be sure that shared relations have at some time been
			 * vacuumed with cutoffs no worse than the global minimum; for, if
			 * there is a backend in some other DB with xmin = OLDXMIN that's
			 * determining the cutoff with which we vacuum shared relations,
			 * it is not possible for that database to have a cutoff newer
			 * than OLDXMIN recorded in pg_database.
			 */
			vacuum_set_xid_limits(vacstmt, false,
								  &initialOldestXmin, &initialFreezeLimit);
		}

		/* matches the StartTransaction in PostgresMain() */
		CommitTransactionCommand(true);
	}

	/*
	 * Loop to process each selected relation.
	 */
	foreach(cur, vrl)
	{
		Oid		relid = (Oid) lfirsti(cur);

		if (vacstmt->vacuum)
			vacuum_rel(relid, vacstmt, RELKIND_RELATION);
		if (vacstmt->analyze)
		{
			MemoryContext old_context = NULL;

			/*
			 * If we vacuumed, use new transaction for analyze.  Otherwise,
			 * we can use the outer transaction, but we still need to call
			 * analyze_rel in a memory context that will be cleaned up on
			 * return (else we leak memory while processing multiple tables).
			 */
			if (vacstmt->vacuum)
				StartTransactionCommand(true);
			else
				old_context = MemoryContextSwitchTo(anl_context);

			analyze_rel(relid, vacstmt);

			if (vacstmt->vacuum)
				CommitTransactionCommand(true);
			else
			{
				MemoryContextSwitchTo(old_context);
				MemoryContextResetAndDeleteChildren(anl_context);
			}
		}
	}

	/*
	 * Finish up processing.
	 */
	if (vacstmt->vacuum)
	{
		/* here, we are not in a transaction */

		/*
		 * This matches the CommitTransaction waiting for us in PostgresMain().
		 * We tell xact.c not to chain the upcoming commit, so that a VACUUM
		 * doesn't start a transaction block, even when autocommit is off.
		 */
		StartTransactionCommand(true);

		/*
		 * If we did a database-wide VACUUM, update the database's pg_database
		 * row with info about the transaction IDs used, and try to truncate
		 * pg_clog.
		 */
		if (vacstmt->relation == NULL)
		{
			vac_update_dbstats(MyDatabaseId,
							   initialOldestXmin, initialFreezeLimit);
			vac_truncate_clog(initialOldestXmin, initialFreezeLimit);
		}
	}

	/*
	 * Clean up working storage --- note we must do this after
	 * StartTransactionCommand, else we might be trying to delete the
	 * active context!
	 */
	MemoryContextDelete(vac_context);
	vac_context = NULL;

	if (anl_context)
		MemoryContextDelete(anl_context);
}

/*
 * Build a list of Oids for each relation to be processed
 *
 * The list is built in vac_context so that it will survive across our
 * per-relation transactions.
 */
static List *
getrels(const RangeVar *vacrel, const char *stmttype)
{
	List	   *vrl = NIL;
	MemoryContext oldcontext;

	if (vacrel)
	{
		/* Process specific relation */
		Oid		relid;

		relid = RangeVarGetRelid(vacrel, false);

		/* Make a relation list entry for this guy */
		oldcontext = MemoryContextSwitchTo(vac_context);
		vrl = lappendi(vrl, relid);
		MemoryContextSwitchTo(oldcontext);
	}
	else
	{
		/* Process all plain relations listed in pg_class */
		Relation	pgclass;
		HeapScanDesc scan;
		HeapTuple	tuple;
		ScanKeyData key;

		ScanKeyEntryInitialize(&key, 0x0,
							   Anum_pg_class_relkind,
							   F_CHAREQ,
							   CharGetDatum(RELKIND_RELATION));

		pgclass = heap_openr(RelationRelationName, AccessShareLock);

		scan = heap_beginscan(pgclass, SnapshotNow, 1, &key);

		while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
		{
			/* Make a relation list entry for this guy */
			oldcontext = MemoryContextSwitchTo(vac_context);
			AssertTupleDescHasOid(pgclass->rd_att);
			vrl = lappendi(vrl, HeapTupleGetOid(tuple));
			MemoryContextSwitchTo(oldcontext);
		}

		heap_endscan(scan);
		heap_close(pgclass, AccessShareLock);
	}

	return vrl;
}

/*
 * vacuum_set_xid_limits() -- compute oldest-Xmin and freeze cutoff points
 */
void
vacuum_set_xid_limits(VacuumStmt *vacstmt, bool sharedRel,
					  TransactionId *oldestXmin,
					  TransactionId *freezeLimit)
{
	TransactionId limit;

	*oldestXmin = GetOldestXmin(sharedRel);

	Assert(TransactionIdIsNormal(*oldestXmin));

	if (vacstmt->freeze)
	{
		/* FREEZE option: use oldest Xmin as freeze cutoff too */
		limit = *oldestXmin;
	}
	else
	{
		/*
		 * Normal case: freeze cutoff is well in the past, to wit, about
		 * halfway to the wrap horizon
		 */
		limit = GetCurrentTransactionId() - (MaxTransactionId >> 2);
	}

	/*
	 * Be careful not to generate a "permanent" XID
	 */
	if (!TransactionIdIsNormal(limit))
		limit = FirstNormalTransactionId;

	/*
	 * Ensure sane relationship of limits
	 */
	if (TransactionIdFollows(limit, *oldestXmin))
	{
		elog(WARNING, "oldest Xmin is far in the past --- close open transactions soon to avoid wraparound problems");
		limit = *oldestXmin;
	}

	*freezeLimit = limit;
}


/*
 *	vac_update_relstats() -- update statistics for one relation
 *
 *		Update the whole-relation statistics that are kept in its pg_class
 *		row.  There are additional stats that will be updated if we are
 *		doing ANALYZE, but we always update these stats.  This routine works
 *		for both index and heap relation entries in pg_class.
 *
 *		We violate no-overwrite semantics here by storing new values for the
 *		statistics columns directly into the pg_class tuple that's already on
 *		the page.  The reason for this is that if we updated these tuples in
 *		the usual way, vacuuming pg_class itself wouldn't work very well ---
 *		by the time we got done with a vacuum cycle, most of the tuples in
 *		pg_class would've been obsoleted.  Of course, this only works for
 *		fixed-size never-null columns, but these are.
 *
 *		This routine is shared by full VACUUM, lazy VACUUM, and stand-alone
 *		ANALYZE.
 */
void
vac_update_relstats(Oid relid, BlockNumber num_pages, double num_tuples,
					bool hasindex)
{
	Relation	rd;
	HeapTupleData rtup;
	HeapTuple	ctup;
	Form_pg_class pgcform;
	Buffer		buffer;

	/*
	 * update number of tuples and number of pages in pg_class
	 */
	rd = heap_openr(RelationRelationName, RowExclusiveLock);

	ctup = SearchSysCache(RELOID,
						  ObjectIdGetDatum(relid),
						  0, 0, 0);
	if (!HeapTupleIsValid(ctup))
		elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
			 relid);

	/* get the buffer cache tuple */
	rtup.t_self = ctup->t_self;
	ReleaseSysCache(ctup);
	if (!heap_fetch(rd, SnapshotNow, &rtup, &buffer, false, NULL))
		elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
			 relid);

	/* overwrite the existing statistics in the tuple */
	pgcform = (Form_pg_class) GETSTRUCT(&rtup);
	pgcform->relpages = (int32) num_pages;
	pgcform->reltuples = num_tuples;
	pgcform->relhasindex = hasindex;

	/*
	 * If we have discovered that there are no indexes, then there's no
	 * primary key either.	This could be done more thoroughly...
	 */
	if (!hasindex)
		pgcform->relhaspkey = false;

	/*
	 * Invalidate the tuple in the catcaches; this also arranges to flush
	 * the relation's relcache entry.  (If we fail to commit for some reason,
	 * no flush will occur, but no great harm is done since there are no
	 * noncritical state updates here.)
	 */
	CacheInvalidateHeapTuple(rd, &rtup);

	/* Write the buffer */
	WriteBuffer(buffer);

	heap_close(rd, RowExclusiveLock);
}


/*
 *	vac_update_dbstats() -- update statistics for one database
 *
 *		Update the whole-database statistics that are kept in its pg_database
 *		row.
 *
 *		We violate no-overwrite semantics here by storing new values for the
 *		statistics columns directly into the tuple that's already on the page.
 *		As with vac_update_relstats, this avoids leaving dead tuples behind
 *		after a VACUUM; which is good since GetRawDatabaseInfo
 *		can get confused by finding dead tuples in pg_database.
 *
 *		This routine is shared by full and lazy VACUUM.  Note that it is only
 *		applied after a database-wide VACUUM operation.
 */
static void
vac_update_dbstats(Oid dbid,
				   TransactionId vacuumXID,
				   TransactionId frozenXID)
{
	Relation	relation;
	ScanKeyData entry[1];
	HeapScanDesc scan;
	HeapTuple	tuple;
	Form_pg_database dbform;

	relation = heap_openr(DatabaseRelationName, RowExclusiveLock);

	/* Must use a heap scan, since there's no syscache for pg_database */
	ScanKeyEntryInitialize(&entry[0], 0x0,
						   ObjectIdAttributeNumber, F_OIDEQ,
						   ObjectIdGetDatum(dbid));

	scan = heap_beginscan(relation, SnapshotNow, 1, entry);

	tuple = heap_getnext(scan, ForwardScanDirection);

	if (!HeapTupleIsValid(tuple))
		elog(ERROR, "database %u does not exist", dbid);

	dbform = (Form_pg_database) GETSTRUCT(tuple);

	/* overwrite the existing statistics in the tuple */
	dbform->datvacuumxid = vacuumXID;
	dbform->datfrozenxid = frozenXID;

	/* invalidate the tuple in the cache and write the buffer */
	CacheInvalidateHeapTuple(relation, tuple);
	WriteNoReleaseBuffer(scan->rs_cbuf);

	heap_endscan(scan);

	heap_close(relation, RowExclusiveLock);
}


/*
 *	vac_truncate_clog() -- attempt to truncate the commit log
 *
 *		Scan pg_database to determine the system-wide oldest datvacuumxid,
 *		and use it to truncate the transaction commit log (pg_clog).
 *		Also generate a warning if the system-wide oldest datfrozenxid
 *		seems to be in danger of wrapping around.
 *
 *		The passed XIDs are simply the ones I just wrote into my pg_database
 *		entry.	They're used to initialize the "min" calculations.
 *
 *		This routine is shared by full and lazy VACUUM.  Note that it is only
 *		applied after a database-wide VACUUM operation.
 */
static void
vac_truncate_clog(TransactionId vacuumXID, TransactionId frozenXID)
{
	TransactionId myXID;
	Relation	relation;
	HeapScanDesc scan;
	HeapTuple	tuple;
	int32		age;
	bool		vacuumAlreadyWrapped = false;
	bool		frozenAlreadyWrapped = false;

	myXID = GetCurrentTransactionId();

	relation = heap_openr(DatabaseRelationName, AccessShareLock);

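	/*
	 * Scan all pg_database rows, tracking the minimum datvacuumxid and
	 * datfrozenxid.  A stored XID that our own XID precedes has already
	 * wrapped around, which makes the computed minimum untrustworthy.
	 */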
	scan = heap_beginscan(relation, SnapshotNow, 0, NULL);

	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		Form_pg_database dbform = (Form_pg_database) GETSTRUCT(tuple);

		/* Ignore non-connectable databases (eg, template0) */
		/* It's assumed that these have been frozen correctly */
		if (!dbform->datallowconn)
			continue;

		if (TransactionIdIsNormal(dbform->datvacuumxid))
		{
			if (TransactionIdPrecedes(myXID, dbform->datvacuumxid))
				vacuumAlreadyWrapped = true;
			else if (TransactionIdPrecedes(dbform->datvacuumxid, vacuumXID))
				vacuumXID = dbform->datvacuumxid;
		}
		if (TransactionIdIsNormal(dbform->datfrozenxid))
		{
			if (TransactionIdPrecedes(myXID, dbform->datfrozenxid))
				frozenAlreadyWrapped = true;
			else if (TransactionIdPrecedes(dbform->datfrozenxid, frozenXID))
				frozenXID = dbform->datfrozenxid;
		}
	}

	heap_endscan(scan);

	heap_close(relation, AccessShareLock);

	/*
	 * Do not truncate CLOG if we seem to have suffered wraparound already;
	 * the computed minimum XID might be bogus.
	 */
	if (vacuumAlreadyWrapped)
	{
		elog(WARNING, "Some databases have not been vacuumed in over 2 billion transactions."
			 "\n\tYou may have already suffered transaction-wraparound data loss.");
		return;
	}

	/* Truncate CLOG to the oldest vacuumxid */
	TruncateCLOG(vacuumXID);

	/* Give warning about impending wraparound problems */
	if (frozenAlreadyWrapped)
	{
		elog(WARNING, "Some databases have not been vacuumed in over 1 billion transactions."
			 "\n\tBetter vacuum them soon, or you may have a wraparound failure.");
	}
	else
	{
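		/*
		 * The warning threshold is three-eighths of the XID space (about
		 * 1.6 billion transactions), i.e. three-quarters of the way to
		 * the 2-billion-transaction wraparound point.
		 */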
		age = (int32) (myXID - frozenXID);
		if (age > (int32) ((MaxTransactionId >> 3) * 3))
			elog(WARNING, "Some databases have not been vacuumed in %d transactions."
				 "\n\tBetter vacuum them within %d transactions,"
				 "\n\tor you may have a wraparound failure.",
				 age, (int32) (MaxTransactionId >> 1) - age);
	}
}


/****************************************************************************
 *																			*
 *			Code common to both flavors of VACUUM							*
 *																			*
 ****************************************************************************
 */


/*
 *	vacuum_rel() -- vacuum one heap relation
 *
 *		Doing one heap at a time incurs extra overhead, since we need to
 *		check that the heap exists again just before we vacuum it.	The
 *		reason that we do this is so that vacuuming can be spread across
 *		many small transactions.  Otherwise, two-phase locking would require
 *		us to lock the entire database during one pass of the vacuum cleaner.
 *
 *		At entry and exit, we are not inside a transaction.
 */
static void
vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind)
{
	LOCKMODE	lmode;
	Relation	onerel;
	LockRelId	onerelid;
	Oid			toast_relid;

	/* Begin a transaction for vacuuming this relation */
	StartTransactionCommand(true);

	/*
	 * Check for user-requested abort.	Note we want this to be inside a
	 * transaction, so xact.c doesn't issue useless WARNING.
	 */
	CHECK_FOR_INTERRUPTS();

	/*
	 * Race condition -- if the pg_class tuple has gone away since the
	 * last time we saw it, we don't need to vacuum it.
	 */
	if (!SearchSysCacheExists(RELOID,
							  ObjectIdGetDatum(relid),
							  0, 0, 0))
	{
		CommitTransactionCommand(true);
		return;
	}

	/*
	 * Determine the type of lock we want --- hard exclusive lock for a
	 * FULL vacuum, but just ShareUpdateExclusiveLock for concurrent
	 * vacuum.	Either way, we can be sure that no other backend is
	 * vacuuming the same table.
	 */
	lmode = vacstmt->full ? AccessExclusiveLock : ShareUpdateExclusiveLock;

	/*
	 * Open the class, get an appropriate lock on it, and check
	 * permissions.
	 *
	 * We allow the user to vacuum a table if he is superuser, the table
	 * owner, or the database owner (but in the latter case, only if it's
	 * not a shared relation).	pg_class_ownercheck includes the superuser case.
	 *
	 * Note we choose to treat permissions failure as a WARNING and keep
	 * trying to vacuum the rest of the DB --- is this appropriate?
	 */
	onerel = relation_open(relid, lmode);

	if (!(pg_class_ownercheck(RelationGetRelid(onerel), GetUserId()) ||
		  (is_dbadmin(MyDatabaseId) && !onerel->rd_rel->relisshared)))
	{
		elog(WARNING, "Skipping \"%s\" --- only table or database owner can VACUUM it",
			 RelationGetRelationName(onerel));
		relation_close(onerel, lmode);
		CommitTransactionCommand(true);
		return;
	}

	/*
	 * Check that it's a plain table; we used to do this in getrels() but
	 * seems safer to check after we've locked the relation.
	 */
	if (onerel->rd_rel->relkind != expected_relkind)
	{
		elog(WARNING, "Skipping \"%s\" --- can not process indexes, views or special system tables",
			 RelationGetRelationName(onerel));
		relation_close(onerel, lmode);
		CommitTransactionCommand(true);
		return;
	}

	/*
	 * Get a session-level lock too. This will protect our access to the
	 * relation across multiple transactions, so that we can vacuum the
	 * relation's TOAST table (if any) secure in the knowledge that no one
	 * is deleting the parent relation.
	 *
	 * NOTE: this cannot block, even if someone else is waiting for access,
	 * because the lock manager knows that both lock requests are from the
	 * same process.
	 */
	onerelid = onerel->rd_lockInfo.lockRelId;
	LockRelationForSession(&onerelid, lmode);

	/*
	 * Remember the relation's TOAST relation for later
	 */
	toast_relid = onerel->rd_rel->reltoastrelid;

	/*
	 * Do the actual work --- either FULL or "lazy" vacuum
	 */
	if (vacstmt->full)
		full_vacuum_rel(onerel, vacstmt);
	else
		lazy_vacuum_rel(onerel, vacstmt);

	/* all done with this class, but hold lock until commit */
	relation_close(onerel, NoLock);

	/*
	 * Complete the transaction and free all temporary memory used.
	 */
	CommitTransactionCommand(true);

	/*
	 * If the relation has a secondary toast rel, vacuum that too while we
	 * still hold the session lock on the master table.  Note however that
	 * "analyze" will not get done on the toast table.	This is good,
	 * because the toaster always uses hardcoded index access and
	 * statistics are totally unimportant for toast relations.
	 */
	if (toast_relid != InvalidOid)
		vacuum_rel(toast_relid, vacstmt, RELKIND_TOASTVALUE);

	/*
	 * Now release the session-level lock on the master table.
	 */
	UnlockRelationForSession(&onerelid, lmode);
}


/****************************************************************************
 *																			*
 *			Code for VACUUM FULL (only)										*
 *																			*
 ****************************************************************************
 */


/*
 *	full_vacuum_rel() -- perform FULL VACUUM for one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indexes, and
 *		updates its num_pages and num_tuples statistics.
 *
 *		At entry, we have already established a transaction and opened
 *		and locked the relation.
 */
static void
full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
{
	VacPageListData vacuum_pages;		/* List of pages to vacuum and/or
										 * clean indexes */
	VacPageListData fraged_pages;		/* List of pages with space enough
										 * for re-using */
	Relation   *Irel;
	int			nindexes,
				i;
	VRelStats  *vacrelstats;
	bool		reindex = false;

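	/*
	 * If system indexes are currently being ignored, this system
	 * relation's indexes would have to be rebuilt from scratch; the
	 * actual reindex calls below are ifdef'd out (NOT_USED) because
	 * reindexing inside VACUUM is unsafe under WAL.
	 */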
	if (IsIgnoringSystemIndexes() &&
		IsSystemRelation(onerel))
		reindex = true;

	vacuum_set_xid_limits(vacstmt, onerel->rd_rel->relisshared,
						  &OldestXmin, &FreezeLimit);

	/*
	 * Set up statistics-gathering machinery.
	 */
	vacrelstats = (VRelStats *) palloc(sizeof(VRelStats));
	vacrelstats->rel_pages = 0;
	vacrelstats->rel_tuples = 0;
	vacrelstats->hasindex = false;

	/* scan the heap */
	vacuum_pages.num_pages = fraged_pages.num_pages = 0;
	scan_heap(vacrelstats, onerel, &vacuum_pages, &fraged_pages);

	/* Now open all indexes of the relation */
	vac_open_indexes(onerel, &nindexes, &Irel);
	if (!Irel)
		reindex = false;
	else if (!RelationGetForm(onerel)->relhasindex)
		reindex = true;
	if (nindexes > 0)
		vacrelstats->hasindex = true;

#ifdef NOT_USED

	/*
	 * reindex in VACUUM is dangerous under WAL. ifdef out until it
	 * becomes safe.
	 */
	if (reindex)
	{
		vac_close_indexes(nindexes, Irel);
		Irel = (Relation *) NULL;
		activate_indexes_of_a_table(RelationGetRelid(onerel), false);
	}
#endif   /* NOT_USED */

	/* Clean/scan index relation(s) */
	if (Irel != (Relation *) NULL)
	{
		if (vacuum_pages.num_pages > 0)
		{
			for (i = 0; i < nindexes; i++)
				vacuum_index(&vacuum_pages, Irel[i],
							 vacrelstats->rel_tuples, 0);
		}
		else
		{
			/* just scan indexes to update statistics */
			for (i = 0; i < nindexes; i++)
				scan_index(Irel[i], vacrelstats->rel_tuples);
		}
	}

	if (fraged_pages.num_pages > 0)
	{
		/* Try to shrink heap */
		repair_frag(vacrelstats, onerel, &vacuum_pages, &fraged_pages,
					nindexes, Irel);
		vac_close_indexes(nindexes, Irel);
	}
	else
	{
		vac_close_indexes(nindexes, Irel);
		if (vacuum_pages.num_pages > 0)
		{
			/* Clean pages from vacuum_pages list */
			vacuum_heap(vacrelstats, onerel, &vacuum_pages);
		}
		else
		{
			/*
			 * Flush dirty pages out to disk.  We must do this even if we
			 * didn't do anything else, because we want to ensure that all
			 * tuples have correct on-row commit status on disk (see
			 * bufmgr.c's comments for FlushRelationBuffers()).
			 */
			i = FlushRelationBuffers(onerel, vacrelstats->rel_pages);
			if (i < 0)
				elog(ERROR, "VACUUM (full_vacuum_rel): FlushRelationBuffers returned %d",
					 i);
		}
	}

#ifdef NOT_USED
	if (reindex)
		activate_indexes_of_a_table(RelationGetRelid(onerel), true);
#endif   /* NOT_USED */

	/* update shared free space map with final free space info */
	vac_update_fsm(onerel, &fraged_pages, vacrelstats->rel_pages);

	/* update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(onerel), vacrelstats->rel_pages,
						vacrelstats->rel_tuples, vacrelstats->hasindex);
}


/*
 *	scan_heap() -- scan an open heap relation
 *
 *		This routine sets commit status bits, constructs vacuum_pages (list
 *		of pages we need to compact free space on and/or clean indexes of
 *		deleted tuples), constructs fraged_pages (list of pages with free
 *		space that tuples could be moved into), and calculates statistics
 *		on the number of live tuples in the heap.
 */
static void
scan_heap(VRelStats *vacrelstats, Relation onerel,
		  VacPageList vacuum_pages, VacPageList fraged_pages)
{
	BlockNumber nblocks,
				blkno;
	ItemId		itemid;
	Buffer		buf;
	HeapTupleData tuple;
	OffsetNumber offnum,
				maxoff;
	bool		pgchanged,
				tupgone,
				notup;
	char	   *relname;
	VacPage		vacpage,
				vacpagecopy;
	BlockNumber empty_pages,
				new_pages,
				changed_pages,
				empty_end_pages;
	double		num_tuples,
				tups_vacuumed,
				nkeep,
				nunused;
	double		free_size,
				usable_free_size;
	Size		min_tlen = MaxTupleSize;
	Size		max_tlen = 0;
	int			i;
	bool		do_shrinking = true;
	VTupleLink	vtlinks = (VTupleLink) palloc(100 * sizeof(VTupleLinkData));
	int			num_vtlinks = 0;
	int			free_vtlinks = 100;
	VacRUsage	ru0;

	vac_init_rusage(&ru0);

	relname = RelationGetRelationName(onerel);
	elog(elevel, "--Relation %s.%s--",
		 get_namespace_name(RelationGetNamespace(onerel)),
		 relname);

	empty_pages = new_pages = changed_pages = empty_end_pages = 0;
	num_tuples = tups_vacuumed = nkeep = nunused = 0;
	free_size = 0;

	nblocks = RelationGetNumberOfBlocks(onerel);

	/*
	 * We initially create each VacPage item in a maximal-sized workspace,
	 * then copy the workspace into a just-large-enough copy.
	 */
	vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Page		page,
					tempPage = NULL;
		bool		do_reap,
					do_frag;

		CHECK_FOR_INTERRUPTS();

		buf = ReadBuffer(onerel, blkno);
		page = BufferGetPage(buf);

		vacpage->blkno = blkno;
		vacpage->offsets_used = 0;
		vacpage->offsets_free = 0;

		if (PageIsNew(page))
		{
			elog(WARNING, "Rel %s: Uninitialized page %u - fixing",
				 relname, blkno);
			PageInit(page, BufferGetPageSize(buf), 0);
			vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
			free_size += (vacpage->free - sizeof(ItemIdData));
			new_pages++;
			empty_end_pages++;
			vacpagecopy = copy_vac_page(vacpage);
			vpage_insert(vacuum_pages, vacpagecopy);
			vpage_insert(fraged_pages, vacpagecopy);
			WriteBuffer(buf);
			continue;
		}

		if (PageIsEmpty(page))
		{
			vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
			free_size += (vacpage->free - sizeof(ItemIdData));
			empty_pages++;
			empty_end_pages++;
			vacpagecopy = copy_vac_page(vacpage);
			vpage_insert(vacuum_pages, vacpagecopy);
			vpage_insert(fraged_pages, vacpagecopy);
			ReleaseBuffer(buf);
			continue;
		}

		pgchanged = false;
		notup = true;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			uint16		sv_infomask;

			itemid = PageGetItemId(page, offnum);

			/*
			 * Collect un-used items too --- it's possible to have index
			 * entries pointing here after a crash.
			 */
			if (!ItemIdIsUsed(itemid))
			{
				vacpage->offsets[vacpage->offsets_free++] = offnum;
				nunused += 1;
				continue;
			}

			tuple.t_datamcxt = NULL;
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);

			tupgone = false;
			sv_infomask = tuple.t_data->t_infomask;

			switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin))
			{
				case HEAPTUPLE_DEAD:
					tupgone = true;		/* we can delete the tuple */
					break;
				case HEAPTUPLE_LIVE:

					/*
					 * Tuple is good.  Consider whether to replace its
					 * xmin value with FrozenTransactionId.
					 */
					if (TransactionIdIsNormal(HeapTupleHeaderGetXmin(tuple.t_data)) &&
						TransactionIdPrecedes(HeapTupleHeaderGetXmin(tuple.t_data),
											  FreezeLimit))
					{
						HeapTupleHeaderSetXmin(tuple.t_data, FrozenTransactionId);
						/* infomask should be okay already */
						Assert(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED);
						pgchanged = true;
					}
					break;
				case HEAPTUPLE_RECENTLY_DEAD:

					/*
					 * If the tuple is recently deleted, then we must not
					 * remove it from the relation.
					 */
					nkeep += 1;

					/*
					 * If we are doing shrinking and this tuple is an
					 * updated one, then remember it so we can construct
					 * the updated-tuple dependency links.
					 */
					if (do_shrinking &&
						!(ItemPointerEquals(&(tuple.t_self),
											&(tuple.t_data->t_ctid))))
					{
						if (free_vtlinks == 0)
						{
							free_vtlinks = 1000;
							vtlinks = (VTupleLink) repalloc(vtlinks,
										   (free_vtlinks + num_vtlinks) *
												 sizeof(VTupleLinkData));
						}
						vtlinks[num_vtlinks].new_tid = tuple.t_data->t_ctid;
						vtlinks[num_vtlinks].this_tid = tuple.t_self;
						free_vtlinks--;
						num_vtlinks++;
					}
					break;
				case HEAPTUPLE_INSERT_IN_PROGRESS:

					/*
					 * This should not happen, since we hold exclusive
					 * lock on the relation; shouldn't we raise an error?
					 */
					elog(WARNING, "Rel %s: TID %u/%u: InsertTransactionInProgress %u - can't shrink relation",
						 relname, blkno, offnum, HeapTupleHeaderGetXmin(tuple.t_data));
					do_shrinking = false;
					break;
				case HEAPTUPLE_DELETE_IN_PROGRESS:

					/*
					 * This should not happen, since we hold exclusive
					 * lock on the relation; shouldn't we raise an error?
					 */
					elog(WARNING, "Rel %s: TID %u/%u: DeleteTransactionInProgress %u - can't shrink relation",
						 relname, blkno, offnum, HeapTupleHeaderGetXmax(tuple.t_data));
					do_shrinking = false;
					break;
				default:
					elog(ERROR, "Unexpected HeapTupleSatisfiesVacuum result");
					break;
			}

			/* check for hint-bit update by HeapTupleSatisfiesVacuum */
			if (sv_infomask != tuple.t_data->t_infomask)
				pgchanged = true;

			/*
			 * Other checks...
			 */
			if (onerel->rd_rel->relhasoids &&
				!OidIsValid(HeapTupleGetOid(&tuple)))
				elog(WARNING, "Rel %s: TID %u/%u: OID IS INVALID. TUPGONE %d.",
					 relname, blkno, offnum, (int) tupgone);

			if (tupgone)
			{
				ItemId		lpp;

				/*
				 * Here we are building a temporary copy of the page with
				 * dead tuples removed.  Below we will apply
				 * PageRepairFragmentation to the copy, so that we can
				 * determine how much space will be available after
				 * removal of dead tuples.	But note we are NOT changing
				 * the real page yet...
				 */
				if (tempPage == (Page) NULL)
				{
					Size		pageSize;

					pageSize = PageGetPageSize(page);
					tempPage = (Page) palloc(pageSize);
					memcpy(tempPage, page, pageSize);
				}

				/* mark it unused on the temp page */
				lpp = PageGetItemId(tempPage, offnum);
				lpp->lp_flags &= ~LP_USED;

				vacpage->offsets[vacpage->offsets_free++] = offnum;
				tups_vacuumed += 1;
			}
			else
			{
				num_tuples += 1;
				notup = false;
				if (tuple.t_len < min_tlen)
					min_tlen = tuple.t_len;
				if (tuple.t_len > max_tlen)
					max_tlen = tuple.t_len;
			}
		}						/* scan along page */

		if (tempPage != (Page) NULL)
		{
			/* Some tuples are removable; figure free space after removal */
			PageRepairFragmentation(tempPage, NULL);
			vacpage->free = ((PageHeader) tempPage)->pd_upper - ((PageHeader) tempPage)->pd_lower;
			pfree(tempPage);
			do_reap = true;
		}
		else
		{
			/* Just use current available space */
			vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
			/* Need to reap the page if it has ~LP_USED line pointers */
			do_reap = (vacpage->offsets_free > 0);
		}

		free_size += vacpage->free;

		/*
		 * Add the page to fraged_pages if it has a useful amount of free
		 * space.  "Useful" means enough for a minimal-sized tuple. But we
		 * don't know that accurately near the start of the relation, so
		 * add pages unconditionally if they have >= BLCKSZ/10 free space.
		 */
		do_frag = (vacpage->free >= min_tlen || vacpage->free >= BLCKSZ / 10);

		if (do_reap || do_frag)
		{
			vacpagecopy = copy_vac_page(vacpage);
			if (do_reap)
				vpage_insert(vacuum_pages, vacpagecopy);
			if (do_frag)
				vpage_insert(fraged_pages, vacpagecopy);
		}

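		/* track the current run of contiguous empty pages at the end of the rel */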
		if (notup)
			empty_end_pages++;
		else
			empty_end_pages = 0;

		if (pgchanged)
		{
			WriteBuffer(buf);
			changed_pages++;
		}
		else
			ReleaseBuffer(buf);
	}

	pfree(vacpage);

	/* save stats in the rel list for use later */
	vacrelstats->rel_tuples = num_tuples;
	vacrelstats->rel_pages = nblocks;
	if (num_tuples == 0)
		min_tlen = max_tlen = 0;
	vacrelstats->min_tlen = min_tlen;
	vacrelstats->max_tlen = max_tlen;

	vacuum_pages->empty_end_pages = empty_end_pages;
	fraged_pages->empty_end_pages = empty_end_pages;

	/*
	 * Clear the fraged_pages list if we found we couldn't shrink. Else,
	 * remove any "empty" end-pages from the list, and compute usable free
	 * space = free space in remaining pages.
	 */
	if (do_shrinking)
	{
		Assert((BlockNumber) fraged_pages->num_pages >= empty_end_pages);
		fraged_pages->num_pages -= empty_end_pages;
		usable_free_size = 0;
		for (i = 0; i < fraged_pages->num_pages; i++)
			usable_free_size += fraged_pages->pagedesc[i]->free;
	}
	else
	{
		fraged_pages->num_pages = 0;
		usable_free_size = 0;
	}

	/* don't bother to save vtlinks if we will not call repair_frag */
	if (fraged_pages->num_pages > 0 && num_vtlinks > 0)
	{
		qsort((char *) vtlinks, num_vtlinks, sizeof(VTupleLinkData),
			  vac_cmp_vtlinks);
		vacrelstats->vtlinks = vtlinks;
		vacrelstats->num_vtlinks = num_vtlinks;
	}
	else
	{
		vacrelstats->vtlinks = NULL;
		vacrelstats->num_vtlinks = 0;
		pfree(vtlinks);
	}

	elog(elevel, "Pages %u: Changed %u, reaped %u, Empty %u, New %u; \
Tup %.0f: Vac %.0f, Keep/VTL %.0f/%u, UnUsed %.0f, MinLen %lu, MaxLen %lu; \
Re-using: Free/Avail. Space %.0f/%.0f; EndEmpty/Avail. Pages %u/%u.\n\t%s",
		 nblocks, changed_pages, vacuum_pages->num_pages, empty_pages,
		 new_pages, num_tuples, tups_vacuumed,
		 nkeep, vacrelstats->num_vtlinks,
		 nunused, (unsigned long) min_tlen, (unsigned long) max_tlen,
		 free_size, usable_free_size,
		 empty_end_pages, fraged_pages->num_pages,
		 vac_show_rusage(&ru0));
}


/*
 *	repair_frag() -- try to repair relation's fragmentation
 *
 *		This routine marks dead tuples as unused and tries to re-use dead
 *		space by moving tuples (and inserting index entries if needed).
 *		It constructs a list (Nvacpagelist) of freed pages (from which
 *		tuples were moved) and cleans their index entries after committing
 *		the current transaction (in hack manner --- without losing locks
 *		or freeing memory!).  It truncates the relation if trailing
 *		end-blocks have been emptied.
 */
static void
repair_frag(VRelStats *vacrelstats, Relation onerel,
			VacPageList vacuum_pages, VacPageList fraged_pages,
			int nindexes, Relation *Irel)
{
	TransactionId myXID;
	CommandId	myCID;
	Buffer		buf,
				cur_buffer;
	BlockNumber nblocks,
				blkno;
	BlockNumber last_move_dest_block = 0,
				last_vacuum_block;
	Page		page,
				ToPage = NULL;
	OffsetNumber offnum,
				maxoff,
				newoff,
				max_offset;
	ItemId		itemid,
				newitemid;
	HeapTupleData tuple,
				newtup;
	TupleDesc	tupdesc;
	ResultRelInfo *resultRelInfo;
	EState	   *estate;
	TupleTable	tupleTable;
	TupleTableSlot *slot;
	VacPageListData Nvacpagelist;
	VacPage		cur_page = NULL,
				last_vacuum_page,
				vacpage,
			   *curpage;
	int			cur_item = 0;
	int			i;
	Size		tuple_len;
	int			num_moved,
				num_fraged_pages,
				vacuumed_pages;
	int			checked_moved,
				num_tuples,
				keep_tuples = 0;
	bool		isempty,
				dowrite,
				chain_tuple_moved;
	VacRUsage	ru0;

	vac_init_rusage(&ru0);

	myXID = GetCurrentTransactionId();
	myCID = GetCurrentCommandId();

	tupdesc = RelationGetDescr(onerel);

	/*
	 * We need a ResultRelInfo and an EState so we can use the regular
	 * executor's index-entry-making machinery.
	 */
	resultRelInfo = makeNode(ResultRelInfo);
	resultRelInfo->ri_RangeTableIndex = 1;		/* dummy */
	resultRelInfo->ri_RelationDesc = onerel;
	resultRelInfo->ri_TrigDesc = NULL;	/* we don't fire triggers */

	ExecOpenIndices(resultRelInfo);

	estate = CreateExecutorState();
	estate->es_result_relations = resultRelInfo;
	estate->es_num_result_relations = 1;
	estate->es_result_relation_info = resultRelInfo;

	/* Set up a dummy tuple table too */
	tupleTable = ExecCreateTupleTable(1);
	slot = ExecAllocTableSlot(tupleTable);
	ExecSetSlotDescriptor(slot, tupdesc, false);

	Nvacpagelist.num_pages = 0;
	num_fraged_pages = fraged_pages->num_pages;
	Assert((BlockNumber) vacuum_pages->num_pages >= vacuum_pages->empty_end_pages);
	vacuumed_pages = vacuum_pages->num_pages - vacuum_pages->empty_end_pages;
	if (vacuumed_pages > 0)
	{
		/* get last reaped page from vacuum_pages */
		last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
		last_vacuum_block = last_vacuum_page->blkno;
	}
	else
	{
		last_vacuum_page = NULL;
		last_vacuum_block = InvalidBlockNumber;
	}
	cur_buffer = InvalidBuffer;
	num_moved = 0;

	vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
	vacpage->offsets_used = vacpage->offsets_free = 0;

	/*
	 * Scan pages backwards from the last nonempty page, trying to move
	 * tuples down to lower pages.	Quit when we reach a page that we have
	 * moved any tuples onto, or the first page if we haven't moved
	 * anything, or when we find a page we cannot completely empty (this
	 * last condition is handled by "break" statements within the loop).
	 *
	 * NB: this code depends on the vacuum_pages and fraged_pages lists being
	 * in order by blkno.
	 */
	nblocks = vacrelstats->rel_pages;
	for (blkno = nblocks - vacuum_pages->empty_end_pages - 1;
		 blkno > last_move_dest_block;
		 blkno--)
	{
		CHECK_FOR_INTERRUPTS();

		/*
		 * Forget fraged_pages pages at or after this one; they're no
		 * longer useful as move targets, since we only want to move down.
		 * Note that since we stop the outer loop at last_move_dest_block,
		 * pages removed here cannot have had anything moved onto them
		 * already.
		 *
		 * Also note that we don't change the stored fraged_pages list, only
		 * our local variable num_fraged_pages; so the forgotten pages are
		 * still available to be loaded into the free space map later.
		 */
		while (num_fraged_pages > 0 &&
			fraged_pages->pagedesc[num_fraged_pages - 1]->blkno >= blkno)
		{
			Assert(fraged_pages->pagedesc[num_fraged_pages - 1]->offsets_used == 0);
			--num_fraged_pages;
		}

		/*
		 * Process this page of relation.
		 */
		buf = ReadBuffer(onerel, blkno);
		page = BufferGetPage(buf);

		vacpage->offsets_free = 0;

		isempty = PageIsEmpty(page);

		dowrite = false;

		/* Is the page in the vacuum_pages list? */
		if (blkno == last_vacuum_block)
		{
			if (last_vacuum_page->offsets_free > 0)
			{
				/* there are dead tuples on this page - clean them */
				Assert(!isempty);
				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
				vacuum_page(onerel, buf, last_vacuum_page);
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
				dowrite = true;
			}
			else
				Assert(isempty);
			--vacuumed_pages;
			if (vacuumed_pages > 0)
			{
				/* get prev reaped page from vacuum_pages */
				last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
				last_vacuum_block = last_vacuum_page->blkno;
			}
			else
			{
				last_vacuum_page = NULL;
				last_vacuum_block = InvalidBlockNumber;
			}
			if (isempty)
			{
				ReleaseBuffer(buf);
				continue;
			}
		}
		else
			Assert(!isempty);

		chain_tuple_moved = false;		/* no chain tuple has been moved
										 * off this page yet */
		vacpage->blkno = blkno;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			itemid = PageGetItemId(page, offnum);

			if (!ItemIdIsUsed(itemid))
				continue;

			tuple.t_datamcxt = NULL;
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple_len = tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);

			if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
			{
				if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
					elog(ERROR, "HEAP_MOVED_IN was not expected");

				/*
				 * If I have already moved this (chain) tuple, then I have
				 * to check whether it is recorded in vacpage --- i.e.,
				 * whether it was moved while cleaning this page or some
				 * previous one.
				 */
				if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
				{
					if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
						elog(ERROR, "Invalid XVAC in tuple header");
					if (keep_tuples == 0)
						continue;
					if (chain_tuple_moved)		/* some chain tuples were
												 * moved while cleaning
												 * this page */
					{
						Assert(vacpage->offsets_free > 0);
						for (i = 0; i < vacpage->offsets_free; i++)
						{
							if (vacpage->offsets[i] == offnum)
								break;
						}
						if (i >= vacpage->offsets_free) /* not found */
						{
							vacpage->offsets[vacpage->offsets_free++] = offnum;
							keep_tuples--;
						}
					}
					else
					{
						vacpage->offsets[vacpage->offsets_free++] = offnum;
						keep_tuples--;
					}
					continue;
				}
				elog(ERROR, "HEAP_MOVED_OFF was expected");
			}

			/*
			 * If this tuple is in a chain of tuples created by updates
			 * from "recent" transactions, then we have to move the whole
			 * chain of tuples to other places.
			 *
			 * NOTE: this test is not 100% accurate: it is possible for
			 * a tuple to be an updated one with recent xmin, and yet not
			 * have a corresponding tuple in the vtlinks list.  Presumably
			 * there was once a parent tuple with xmax matching the xmin,
			 * but it's possible that that tuple has been removed --- for
			 * example, if it had xmin = xmax then HeapTupleSatisfiesVacuum
			 * would deem it removable as soon as the xmin xact completes.
			 *
			 * To be on the safe side, we abandon the repair_frag process if
			 * we cannot find the parent tuple in vtlinks.  This may be overly
			 * conservative; AFAICS it would be safe to move the chain.
			 */
			if (((tuple.t_data->t_infomask & HEAP_UPDATED) &&
			 !TransactionIdPrecedes(HeapTupleHeaderGetXmin(tuple.t_data),
			                        OldestXmin)) ||
				(!(tuple.t_data->t_infomask & (HEAP_XMAX_INVALID |
											   HEAP_MARKED_FOR_UPDATE)) &&
				 !(ItemPointerEquals(&(tuple.t_self),
									 &(tuple.t_data->t_ctid)))))
			{
				Buffer		Cbuf = buf;
				bool		freeCbuf = false;
				bool		chain_move_failed = false;
				Page		Cpage;
				ItemId		Citemid;
				ItemPointerData Ctid;
				HeapTupleData tp = tuple;
				Size		tlen = tuple_len;
				VTupleMove	vtmove;
				int			num_vtmove;
				int			free_vtmove;
B
Bruce Momjian 已提交
1616
				VacPage		to_vacpage = NULL;
B
Bruce Momjian 已提交
1617 1618
				int			to_item = 0;
				int			ti;
1619 1620 1621 1622 1623 1624

				if (cur_buffer != InvalidBuffer)
				{
					WriteBuffer(cur_buffer);
					cur_buffer = InvalidBuffer;
				}
B
Bruce Momjian 已提交
1625

1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636
				/* Quick exit if we have no vtlinks to search in */
				if (vacrelstats->vtlinks == NULL)
				{
					elog(WARNING, "Parent item in update-chain not found - can't continue repair_frag");
					break;	/* out of walk-along-page loop */
				}

				vtmove = (VTupleMove) palloc(100 * sizeof(VTupleMoveData));
				num_vtmove = 0;
				free_vtmove = 100;

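				/*
				 * The chain move proceeds in three phases: first follow
				 * t_ctid links forward to the newest version at the end
				 * of the chain; then walk back toward the oldest version
				 * via the vtlinks array, checking that every member can
				 * be moved and recording the planned moves in vtmove;
				 * only if the whole chain checks out are the moves
				 * actually executed.
				 */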
				/*
				 * If this tuple is in the beginning or middle of the
				 * chain, then we have to move to the end of the chain
				 * first.
				 */
				while (!(tp.t_data->t_infomask & (HEAP_XMAX_INVALID |
												  HEAP_MARKED_FOR_UPDATE)) &&
					   !(ItemPointerEquals(&(tp.t_self),
										   &(tp.t_data->t_ctid))))
				{
					Ctid = tp.t_data->t_ctid;
					if (freeCbuf)
						ReleaseBuffer(Cbuf);
					freeCbuf = true;
					Cbuf = ReadBuffer(onerel,
									  ItemPointerGetBlockNumber(&Ctid));
					Cpage = BufferGetPage(Cbuf);
					Citemid = PageGetItemId(Cpage,
									  ItemPointerGetOffsetNumber(&Ctid));
					if (!ItemIdIsUsed(Citemid))
					{
						/*
						 * This means that somewhere in the middle of the
						 * chain there was a tuple updated by an xaction
						 * older than OldestXmin, and that tuple has
						 * already been deleted by me.  Actually, the
						 * upper part of the chain should be removed, and
						 * it seems that this ought to be handled in
						 * scan_heap(), but it's not implemented at the
						 * moment, so we just stop shrinking here.
						 */
						elog(WARNING, "Child itemid in update-chain marked as unused - can't continue repair_frag");
						chain_move_failed = true;
						break;	/* out of loop to move to chain end */
					}
					tp.t_datamcxt = NULL;
					tp.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
					tp.t_self = Ctid;
					tlen = tp.t_len = ItemIdGetLength(Citemid);
				}
				if (chain_move_failed)
				{
					if (freeCbuf)
						ReleaseBuffer(Cbuf);
					pfree(vtmove);
					break;		/* out of walk-along-page loop */
				}

				/*
				 * Check if all items in chain can be moved
				 */
				for (;;)
				{
					Buffer		Pbuf;
					Page		Ppage;
					ItemId		Pitemid;
					HeapTupleData Ptp;
					VTupleLinkData vtld,
							   *vtlp;

					if (to_vacpage == NULL ||
						!enough_space(to_vacpage, tlen))
					{
						for (i = 0; i < num_fraged_pages; i++)
						{
							if (enough_space(fraged_pages->pagedesc[i], tlen))
								break;
						}

						if (i == num_fraged_pages)
						{
							/* can't move item anywhere */
							chain_move_failed = true;
							break; /* out of check-all-items loop */
						}
						to_item = i;
						to_vacpage = fraged_pages->pagedesc[to_item];
					}
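					/*
					 * Reserve space for this chain member on the chosen
					 * destination page: the maxaligned tuple itself, plus
					 * a new line pointer if every slot being freed on
					 * that page is already claimed by earlier
					 * reservations.
					 */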
					to_vacpage->free -= MAXALIGN(tlen);
					if (to_vacpage->offsets_used >= to_vacpage->offsets_free)
						to_vacpage->free -= MAXALIGN(sizeof(ItemIdData));
					(to_vacpage->offsets_used)++;
					if (free_vtmove == 0)
					{
						free_vtmove = 1000;
						vtmove = (VTupleMove)
							repalloc(vtmove,
									 (free_vtmove + num_vtmove) *
									 sizeof(VTupleMoveData));
					}
					vtmove[num_vtmove].tid = tp.t_self;
					vtmove[num_vtmove].vacpage = to_vacpage;
					if (to_vacpage->offsets_used == 1)
						vtmove[num_vtmove].cleanVpd = true;
					else
						vtmove[num_vtmove].cleanVpd = false;
					free_vtmove--;
					num_vtmove++;

					/* At beginning of chain? */
					if (!(tp.t_data->t_infomask & HEAP_UPDATED) ||
						TransactionIdPrecedes(HeapTupleHeaderGetXmin(tp.t_data),
											  OldestXmin))
						break;

					/* No, move to tuple with prior row version */
					vtld.new_tid = tp.t_self;
					vtlp = (VTupleLink)
						vac_bsearch((void *) &vtld,
									(void *) (vacrelstats->vtlinks),
									vacrelstats->num_vtlinks,
									sizeof(VTupleLinkData),
									vac_cmp_vtlinks);
					if (vtlp == NULL)
					{
						/* see discussion above */
						elog(WARNING, "Parent item in update-chain not found - can't continue repair_frag");
						chain_move_failed = true;
						break; /* out of check-all-items loop */
					}
					tp.t_self = vtlp->this_tid;
					Pbuf = ReadBuffer(onerel,
									  ItemPointerGetBlockNumber(&(tp.t_self)));
					Ppage = BufferGetPage(Pbuf);
					Pitemid = PageGetItemId(Ppage,
											ItemPointerGetOffsetNumber(&(tp.t_self)));
					/* this can't happen since we saw tuple earlier: */
					if (!ItemIdIsUsed(Pitemid))
						elog(ERROR, "Parent itemid marked as unused");
					Ptp.t_datamcxt = NULL;
					Ptp.t_data = (HeapTupleHeader) PageGetItem(Ppage, Pitemid);

					/* ctid should not have changed since we saved it */
					Assert(ItemPointerEquals(&(vtld.new_tid),
											 &(Ptp.t_data->t_ctid)));

					/*
					 * Read above about cases when !ItemIdIsUsed(Citemid)
					 * (child item is removed)...  Because we don't remove
					 * the useless part of an update-chain at the moment,
					 * it's possible to get a too-old parent row here.  As
					 * in the case which caused this problem, we stop
					 * shrinking here.  I could try to find the real
					 * parent row, but I don't want to, because the real
					 * solution will be implemented anyway, later, and we
					 * are too close to the 6.5 release. - vadim 06/11/99
					 */
					if (!(TransactionIdEquals(HeapTupleHeaderGetXmax(Ptp.t_data),
											  HeapTupleHeaderGetXmin(tp.t_data))))
					{
						ReleaseBuffer(Pbuf);
						elog(WARNING, "Too old parent tuple found - can't continue repair_frag");
						chain_move_failed = true;
						break; /* out of check-all-items loop */
					}
					tp.t_datamcxt = Ptp.t_datamcxt;
					tp.t_data = Ptp.t_data;
					tlen = tp.t_len = ItemIdGetLength(Pitemid);
					if (freeCbuf)
						ReleaseBuffer(Cbuf);
					Cbuf = Pbuf;
					freeCbuf = true;
				} /* end of check-all-items loop */

				if (freeCbuf)
					ReleaseBuffer(Cbuf);
				freeCbuf = false;

				if (chain_move_failed)
				{
					/*
					 * Undo changes to offsets_used state.  We don't bother
					 * cleaning up the amount-free state, since we're not
					 * going to do any further tuple motion.
					 */
					for (i = 0; i < num_vtmove; i++)
					{
						Assert(vtmove[i].vacpage->offsets_used > 0);
						(vtmove[i].vacpage->offsets_used)--;
					}
					pfree(vtmove);
					break;		/* out of walk-along-page loop */
				}

				/*
				 * Okay, move the whole tuple chain
				 */
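				/*
				 * vtmove[0] is the newest version (the end of the
				 * chain); later entries are successively older versions.
				 * The loop below therefore moves the chain from its tail
				 * back toward its head, pointing each moved tuple's
				 * t_ctid at the member moved just before it.
				 */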
				ItemPointerSetInvalid(&Ctid);
				for (ti = 0; ti < num_vtmove; ti++)
				{
					VacPage		destvacpage = vtmove[ti].vacpage;

					/* Get page to move from */
					tuple.t_self = vtmove[ti].tid;
					Cbuf = ReadBuffer(onerel,
							 ItemPointerGetBlockNumber(&(tuple.t_self)));

					/* Get page to move to */
					cur_buffer = ReadBuffer(onerel, destvacpage->blkno);

					LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
					if (cur_buffer != Cbuf)
						LockBuffer(Cbuf, BUFFER_LOCK_EXCLUSIVE);

					ToPage = BufferGetPage(cur_buffer);
					Cpage = BufferGetPage(Cbuf);

					Citemid = PageGetItemId(Cpage,
							ItemPointerGetOffsetNumber(&(tuple.t_self)));
					tuple.t_datamcxt = NULL;
					tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
					tuple_len = tuple.t_len = ItemIdGetLength(Citemid);

					/*
					 * make a copy of the source tuple, and then mark the
					 * source tuple MOVED_OFF.
					 */
					heap_copytuple_with_tuple(&tuple, &newtup);

					/*
					 * register invalidation of source tuple in catcaches.
					 */
					CacheInvalidateHeapTuple(onerel, &tuple);

					/* NO ELOG(ERROR) TILL CHANGES ARE LOGGED */
					START_CRIT_SECTION();

					tuple.t_data->t_infomask &=
						~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
					tuple.t_data->t_infomask |= HEAP_MOVED_OFF;
					HeapTupleHeaderSetXvac(tuple.t_data, myXID);

					/*
					 * If this page was not used before - clean it.
					 *
					 * NOTE: a nasty bug used to lurk here.  It is possible
					 * for the source and destination pages to be the same
					 * (since this tuple-chain member can be on a page
					 * lower than the one we're currently processing in
					 * the outer loop).  If that's true, then after
					 * vacuum_page() the source tuple will have been
					 * moved, and tuple.t_data will be pointing at
					 * garbage.  Therefore we must do everything that uses
					 * tuple.t_data BEFORE this step!!
					 *
					 * This path is different from the other callers of
					 * vacuum_page, because we have already incremented
					 * the vacpage's offsets_used field to account for the
					 * tuple(s) we expect to move onto the page. Therefore
					 * vacuum_page's check for offsets_used == 0 is wrong.
					 * But since that's a good debugging check for all
					 * other callers, we work around it here rather than
					 * remove it.
					 */
					if (!PageIsEmpty(ToPage) && vtmove[ti].cleanVpd)
					{
						int			sv_offsets_used = destvacpage->offsets_used;

						destvacpage->offsets_used = 0;
						vacuum_page(onerel, cur_buffer, destvacpage);
						destvacpage->offsets_used = sv_offsets_used;
					}

					/*
					 * Update the state of the copied tuple, and store it
					 * on the destination page.
					 */
					newtup.t_data->t_infomask &=
						~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_OFF);
					newtup.t_data->t_infomask |= HEAP_MOVED_IN;
					HeapTupleHeaderSetXvac(newtup.t_data, myXID);
					newoff = PageAddItem(ToPage, (Item) newtup.t_data, tuple_len,
										 InvalidOffsetNumber, LP_USED);
					if (newoff == InvalidOffsetNumber)
					{
						elog(PANIC, "moving chain: failed to add item with len = %lu to page %u",
						  (unsigned long) tuple_len, destvacpage->blkno);
					}
					newitemid = PageGetItemId(ToPage, newoff);
					pfree(newtup.t_data);
					newtup.t_datamcxt = NULL;
					newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
					ItemPointerSet(&(newtup.t_self), destvacpage->blkno, newoff);

					/* XLOG stuff */
					if (!onerel->rd_istemp)
					{
						XLogRecPtr	recptr =
						log_heap_move(onerel, Cbuf, tuple.t_self,
									  cur_buffer, &newtup);

						if (Cbuf != cur_buffer)
						{
							PageSetLSN(Cpage, recptr);
							PageSetSUI(Cpage, ThisStartUpID);
						}
						PageSetLSN(ToPage, recptr);
						PageSetSUI(ToPage, ThisStartUpID);
					}
					else
					{
						/* No XLOG record, but still need to flag that XID exists on disk */
						MyXactMadeTempRelUpdate = true;
					}

					END_CRIT_SECTION();

					if (destvacpage->blkno > last_move_dest_block)
						last_move_dest_block = destvacpage->blkno;

					/*
					 * Set new tuple's t_ctid pointing to itself for last
					 * tuple in chain, and to next tuple in chain
					 * otherwise.
					 */
					if (!ItemPointerIsValid(&Ctid))
						newtup.t_data->t_ctid = newtup.t_self;
					else
						newtup.t_data->t_ctid = Ctid;
					Ctid = newtup.t_self;

					num_moved++;

					/*
					 * Remember that we moved tuple from the current page
					 * (corresponding index tuple will be cleaned).
					 */
					if (Cbuf == buf)
						vacpage->offsets[vacpage->offsets_free++] =
							ItemPointerGetOffsetNumber(&(tuple.t_self));
					else
						keep_tuples++;

					LockBuffer(cur_buffer, BUFFER_LOCK_UNLOCK);
					if (cur_buffer != Cbuf)
						LockBuffer(Cbuf, BUFFER_LOCK_UNLOCK);

					/* Create index entries for the moved tuple */
					if (resultRelInfo->ri_NumIndices > 0)
					{
						ExecStoreTuple(&newtup, slot, InvalidBuffer, false);
						ExecInsertIndexTuples(slot, &(newtup.t_self),
											  estate, true);
					}

					WriteBuffer(cur_buffer);
					WriteBuffer(Cbuf);
				} /* end of move-the-tuple-chain loop */

				cur_buffer = InvalidBuffer;
				pfree(vtmove);
				chain_tuple_moved = true;

				/* advance to next tuple in walk-along-page loop */
				continue;
			} /* end of is-tuple-in-chain test */

			/* try to find new page for this tuple */
			if (cur_buffer == InvalidBuffer ||
				!enough_space(cur_page, tuple_len))
			{
				if (cur_buffer != InvalidBuffer)
				{
					WriteBuffer(cur_buffer);
					cur_buffer = InvalidBuffer;
				}
				for (i = 0; i < num_fraged_pages; i++)
				{
					if (enough_space(fraged_pages->pagedesc[i], tuple_len))
						break;
				}
				if (i == num_fraged_pages)
					break;		/* can't move item anywhere */
				cur_item = i;
				cur_page = fraged_pages->pagedesc[cur_item];
				cur_buffer = ReadBuffer(onerel, cur_page->blkno);
				LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
				ToPage = BufferGetPage(cur_buffer);
				/* if this page was not used before - clean it */
				if (!PageIsEmpty(ToPage) && cur_page->offsets_used == 0)
					vacuum_page(onerel, cur_buffer, cur_page);
			}
			else
				LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);

			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

			/* copy tuple */
			heap_copytuple_with_tuple(&tuple, &newtup);

			/*
			 * register invalidation of source tuple in catcaches.
			 *
			 * (Note: we do not need to register the copied tuple,
			 * because we are not changing the tuple contents and
			 * so there cannot be any need to flush negative
			 * catcache entries.)
			 */
			CacheInvalidateHeapTuple(onerel, &tuple);

			/* NO ELOG(ERROR) TILL CHANGES ARE LOGGED */
			START_CRIT_SECTION();

			/*
			 * Mark new tuple as moved_in by vacuum and store vacuum XID
			 * in t_cid !!!
			 */
			newtup.t_data->t_infomask &=
				~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_OFF);
			newtup.t_data->t_infomask |= HEAP_MOVED_IN;
			HeapTupleHeaderSetXvac(newtup.t_data, myXID);

			/* add tuple to the page */
			newoff = PageAddItem(ToPage, (Item) newtup.t_data, tuple_len,
								 InvalidOffsetNumber, LP_USED);
			if (newoff == InvalidOffsetNumber)
			{
				elog(PANIC, "failed to add item with len = %lu to page %u (free space %lu, nusd %u, noff %u)",
					 (unsigned long) tuple_len,
					 cur_page->blkno, (unsigned long) cur_page->free,
					 cur_page->offsets_used, cur_page->offsets_free);
			}
			newitemid = PageGetItemId(ToPage, newoff);
			pfree(newtup.t_data);
			newtup.t_datamcxt = NULL;
			newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
			ItemPointerSet(&(newtup.t_data->t_ctid), cur_page->blkno, newoff);
			newtup.t_self = newtup.t_data->t_ctid;

			/*
			 * Mark old tuple as moved_off by vacuum and store vacuum XID
			 * in t_cid !!!
			 */
			tuple.t_data->t_infomask &=
				~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
			tuple.t_data->t_infomask |= HEAP_MOVED_OFF;
			HeapTupleHeaderSetXvac(tuple.t_data, myXID);

			/* XLOG stuff */
			if (!onerel->rd_istemp)
			{
				XLogRecPtr	recptr =
				log_heap_move(onerel, buf, tuple.t_self,
							  cur_buffer, &newtup);

				PageSetLSN(page, recptr);
				PageSetSUI(page, ThisStartUpID);
				PageSetLSN(ToPage, recptr);
				PageSetSUI(ToPage, ThisStartUpID);
			}
			else
			{
				/* No XLOG record, but still need to flag that XID exists on disk */
				MyXactMadeTempRelUpdate = true;
			}

			END_CRIT_SECTION();

			cur_page->offsets_used++;
			num_moved++;
			cur_page->free = ((PageHeader) ToPage)->pd_upper - ((PageHeader) ToPage)->pd_lower;
			if (cur_page->blkno > last_move_dest_block)
				last_move_dest_block = cur_page->blkno;

			vacpage->offsets[vacpage->offsets_free++] = offnum;

			LockBuffer(cur_buffer, BUFFER_LOCK_UNLOCK);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);

			/* insert index tuples if needed */
			if (resultRelInfo->ri_NumIndices > 0)
			{
				ExecStoreTuple(&newtup, slot, InvalidBuffer, false);
				ExecInsertIndexTuples(slot, &(newtup.t_self), estate, true);
			}
		}						/* walk along page */

		/*
		 * If we broke out of the walk-along-page loop early (ie, still
		 * have offnum <= maxoff), then we failed to move some tuple off
		 * this page.  No point in shrinking any more, so clean up and
		 * exit the per-page loop.
		 */
		if (offnum < maxoff && keep_tuples > 0)
		{
			OffsetNumber off;

			/*
			 * Fix vacpage state for any unvisited tuples remaining on page
			 */
			for (off = OffsetNumberNext(offnum);
				 off <= maxoff;
				 off = OffsetNumberNext(off))
			{
				itemid = PageGetItemId(page, off);
				if (!ItemIdIsUsed(itemid))
					continue;
				tuple.t_datamcxt = NULL;
				tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
				if (tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)
					continue;
				if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
					elog(ERROR, "HEAP_MOVED_IN was not expected (2)");
				if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
				{
					if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
						elog(ERROR, "Invalid XVAC in tuple header (4)");
					/* some chains were moved while cleaning this page */
					if (chain_tuple_moved)
					{
						Assert(vacpage->offsets_free > 0);
						for (i = 0; i < vacpage->offsets_free; i++)
						{
							if (vacpage->offsets[i] == off)
								break;
						}
						if (i >= vacpage->offsets_free) /* not found */
						{
							vacpage->offsets[vacpage->offsets_free++] = off;
							Assert(keep_tuples > 0);
							keep_tuples--;
						}
					}
					else
					{
						vacpage->offsets[vacpage->offsets_free++] = off;
						Assert(keep_tuples > 0);
						keep_tuples--;
					}
				}
				else
					elog(ERROR, "HEAP_MOVED_OFF was expected (2)");
			}
		}

		if (vacpage->offsets_free > 0)	/* some tuples were moved */
		{
			if (chain_tuple_moved)		/* else - they are ordered */
			{
				qsort((char *) (vacpage->offsets), vacpage->offsets_free,
					  sizeof(OffsetNumber), vac_cmp_offno);
			}
			vpage_insert(&Nvacpagelist, copy_vac_page(vacpage));
			WriteBuffer(buf);
		}
		else if (dowrite)
			WriteBuffer(buf);
		else
			ReleaseBuffer(buf);

		if (offnum <= maxoff)
			break;				/* had to quit early, see above note */

	}							/* walk along relation */

	blkno++;					/* new number of blocks */

	if (cur_buffer != InvalidBuffer)
	{
		Assert(num_moved > 0);
		WriteBuffer(cur_buffer);
	}

	if (num_moved > 0)
	{
		/*
		 * We have to commit our tuple moves before we truncate the
		 * relation.  Ideally we should do Commit/StartTransactionCommand
		 * here, relying on the session-level table lock to protect our
		 * exclusive access to the relation.  However, that would require
		 * a lot of extra code to close and re-open the relation, indexes,
		 * etc.  For now, a quick hack: record status of current
		 * transaction as committed, and continue.
		 */
		RecordTransactionCommit();
	}

	/*
	 * We are not going to move any more tuples across pages, but we still
	 * need to apply vacuum_page to compact free space in the remaining
	 * pages in vacuum_pages list.	Note that some of these pages may also
	 * be in the fraged_pages list, and may have had tuples moved onto
	 * them; if so, we already did vacuum_page and needn't do it again.
	 */
	for (i = 0, curpage = vacuum_pages->pagedesc;
		 i < vacuumed_pages;
		 i++, curpage++)
	{
		CHECK_FOR_INTERRUPTS();
		Assert((*curpage)->blkno < blkno);
		if ((*curpage)->offsets_used == 0)
		{
			/* this page was not used as a move target, so must clean it */
			buf = ReadBuffer(onerel, (*curpage)->blkno);
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
			page = BufferGetPage(buf);
			if (!PageIsEmpty(page))
				vacuum_page(onerel, buf, *curpage);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			WriteBuffer(buf);
		}
	}

	/*
	 * Now scan all the pages that we moved tuples onto and update tuple
	 * status bits.  This is not really necessary, but will save time for
	 * future transactions examining these tuples.
	 *
	 * XXX WARNING that this code fails to clear HEAP_MOVED_OFF tuples from
	 * pages that were move source pages but not move dest pages.  One
	 * also wonders whether it wouldn't be better to skip this step and
	 * let the tuple status updates happen someplace that's not holding an
	 * exclusive lock on the relation.
	 */
	checked_moved = 0;
	for (i = 0, curpage = fraged_pages->pagedesc;
		 i < num_fraged_pages;
		 i++, curpage++)
	{
		CHECK_FOR_INTERRUPTS();
		Assert((*curpage)->blkno < blkno);
		if ((*curpage)->blkno > last_move_dest_block)
			break;				/* no need to scan any further */
		if ((*curpage)->offsets_used == 0)
			continue;			/* this page was never used as a move dest */
		buf = ReadBuffer(onerel, (*curpage)->blkno);
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		page = BufferGetPage(buf);
		num_tuples = 0;
		max_offset = PageGetMaxOffsetNumber(page);
		for (newoff = FirstOffsetNumber;
			 newoff <= max_offset;
			 newoff = OffsetNumberNext(newoff))
		{
			itemid = PageGetItemId(page, newoff);
			if (!ItemIdIsUsed(itemid))
				continue;
			tuple.t_datamcxt = NULL;
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
			{
				if (!(tuple.t_data->t_infomask & HEAP_MOVED))
					elog(ERROR, "HEAP_MOVED_OFF/HEAP_MOVED_IN was expected");
				if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
					elog(ERROR, "Invalid XVAC in tuple header (2)");
				if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
				{
					tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
					tuple.t_data->t_infomask &= ~HEAP_MOVED;
					num_tuples++;
				}
				else
					tuple.t_data->t_infomask |= HEAP_XMIN_INVALID;
			}
		}
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		WriteBuffer(buf);
		Assert((*curpage)->offsets_used == num_tuples);
		checked_moved += num_tuples;
	}
	Assert(num_moved == checked_moved);

	elog(elevel, "Rel %s: Pages: %u --> %u; Tuple(s) moved: %u.\n\t%s",
		 RelationGetRelationName(onerel),
		 nblocks, blkno, num_moved,
		 vac_show_rusage(&ru0));

	/*
	 * Reflect the motion of system tuples to catalog cache here.
	 */
	CommandCounterIncrement();

	if (Nvacpagelist.num_pages > 0)
	{
		/* vacuum indexes again if needed */
		if (Irel != (Relation *) NULL)
		{
			VacPage    *vpleft,
					   *vpright,
						vpsave;

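			/*
			 * The binary search in tid_reaped() (via vac_cmp_blk)
			 * requires pagedesc in ascending block order; the swap loop
			 * below reverses the array, the list evidently having been
			 * built in descending block order by the preceding page
			 * walk.
			 */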
			/* re-sort Nvacpagelist.pagedesc */
			for (vpleft = Nvacpagelist.pagedesc,
			vpright = Nvacpagelist.pagedesc + Nvacpagelist.num_pages - 1;
				 vpleft < vpright; vpleft++, vpright--)
			{
				vpsave = *vpleft;
				*vpleft = *vpright;
				*vpright = vpsave;
			}
			Assert(keep_tuples >= 0);
			for (i = 0; i < nindexes; i++)
				vacuum_index(&Nvacpagelist, Irel[i],
							 vacrelstats->rel_tuples, keep_tuples);
		}

		/* clean moved tuples from last page in Nvacpagelist list */
		if (vacpage->blkno == (blkno - 1) &&
			vacpage->offsets_free > 0)
		{
			OffsetNumber unbuf[BLCKSZ / sizeof(OffsetNumber)];
			OffsetNumber *unused = unbuf;
			int			uncnt;

			buf = ReadBuffer(onerel, vacpage->blkno);
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
			page = BufferGetPage(buf);
			num_tuples = 0;
			maxoff = PageGetMaxOffsetNumber(page);
			for (offnum = FirstOffsetNumber;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				itemid = PageGetItemId(page, offnum);
				if (!ItemIdIsUsed(itemid))
					continue;
				tuple.t_datamcxt = NULL;
				tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);

				if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
				{
					if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
					{
						if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
							elog(ERROR, "Invalid XVAC in tuple header (3)");
						itemid->lp_flags &= ~LP_USED;
						num_tuples++;
					}
					else
						elog(ERROR, "HEAP_MOVED_OFF was expected (3)");
				}
			}
			Assert(vacpage->offsets_free == num_tuples);

			START_CRIT_SECTION();

			uncnt = PageRepairFragmentation(page, unused);

			/* XLOG stuff */
			if (!onerel->rd_istemp)
			{
				XLogRecPtr	recptr;

				recptr = log_heap_clean(onerel, buf, (char *) unused,
						  (char *) (&(unused[uncnt])) - (char *) unused);
				PageSetLSN(page, recptr);
				PageSetSUI(page, ThisStartUpID);
			}
			else
			{
				/* No XLOG record, but still need to flag that XID exists on disk */
				MyXactMadeTempRelUpdate = true;
			}

			END_CRIT_SECTION();

			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			WriteBuffer(buf);
		}

		/* now - free new list of reaped pages */
		curpage = Nvacpagelist.pagedesc;
		for (i = 0; i < Nvacpagelist.num_pages; i++, curpage++)
			pfree(*curpage);
		pfree(Nvacpagelist.pagedesc);
	}

	/*
	 * Flush dirty pages out to disk.  We do this unconditionally, even if
	 * we don't need to truncate, because we want to ensure that all
	 * tuples have correct on-row commit status on disk (see bufmgr.c's
	 * comments for FlushRelationBuffers()).
	 */
	i = FlushRelationBuffers(onerel, blkno);
	if (i < 0)
		elog(ERROR, "VACUUM (repair_frag): FlushRelationBuffers returned %d",
			 i);

	/* truncate relation, if needed */
	if (blkno < nblocks)
	{
		blkno = smgrtruncate(DEFAULT_SMGR, onerel, blkno);
		onerel->rd_nblocks = blkno;		/* update relcache immediately */
		onerel->rd_targblock = InvalidBlockNumber;
		vacrelstats->rel_pages = blkno; /* set new number of blocks */
	}

	/* clean up */
	pfree(vacpage);
	if (vacrelstats->vtlinks != NULL)
		pfree(vacrelstats->vtlinks);

	ExecDropTupleTable(tupleTable, true);

	ExecCloseIndices(resultRelInfo);
}

/*
 *	vacuum_heap() -- free dead tuples
 *
 *		This routine marks dead tuples as unused and truncates relation
 *		if there are "empty" end-blocks.
 */
static void
vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
{
	Buffer		buf;
	VacPage    *vacpage;
	BlockNumber relblocks;
	int			nblocks;
	int			i;

	nblocks = vacuum_pages->num_pages;
	nblocks -= vacuum_pages->empty_end_pages;	/* nothing to do with them */

	for (i = 0, vacpage = vacuum_pages->pagedesc; i < nblocks; i++, vacpage++)
	{
		CHECK_FOR_INTERRUPTS();
		if ((*vacpage)->offsets_free > 0)
		{
			buf = ReadBuffer(onerel, (*vacpage)->blkno);
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
			vacuum_page(onerel, buf, *vacpage);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			WriteBuffer(buf);
		}
	}

	/*
	 * Flush dirty pages out to disk.  We do this unconditionally, even if
	 * we don't need to truncate, because we want to ensure that all
	 * tuples have correct on-row commit status on disk (see bufmgr.c's
	 * comments for FlushRelationBuffers()).
	 */
	Assert(vacrelstats->rel_pages >= vacuum_pages->empty_end_pages);
	relblocks = vacrelstats->rel_pages - vacuum_pages->empty_end_pages;

	i = FlushRelationBuffers(onerel, relblocks);
	if (i < 0)
		elog(ERROR, "VACUUM (vacuum_heap): FlushRelationBuffers returned %d",
			 i);

	/* truncate relation if there are some empty end-pages */
	if (vacuum_pages->empty_end_pages > 0)
	{
		elog(elevel, "Rel %s: Pages: %u --> %u.",
			 RelationGetRelationName(onerel),
			 vacrelstats->rel_pages, relblocks);
		relblocks = smgrtruncate(DEFAULT_SMGR, onerel, relblocks);
		onerel->rd_nblocks = relblocks; /* update relcache immediately */
		onerel->rd_targblock = InvalidBlockNumber;
		vacrelstats->rel_pages = relblocks;		/* set new number of blocks */
	}
}

/*
 *	vacuum_page() -- free dead tuples on a page
 *					 and repair its fragmentation.
 */
static void
vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
{
	OffsetNumber unbuf[BLCKSZ / sizeof(OffsetNumber)];
	OffsetNumber *unused = unbuf;
	int			uncnt;
	Page		page = BufferGetPage(buffer);
	ItemId		itemid;
	int			i;

	/* There shouldn't be any tuples moved onto the page yet! */
	Assert(vacpage->offsets_used == 0);

	START_CRIT_SECTION();

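	/*
	 * Mark each reaped line pointer unused, then compact the page.  Both
	 * steps are done inside the critical section, together with the WAL
	 * logging below, so an error cannot leave the page half-updated.
	 */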
	for (i = 0; i < vacpage->offsets_free; i++)
	{
		itemid = PageGetItemId(page, vacpage->offsets[i]);
		itemid->lp_flags &= ~LP_USED;
	}

	uncnt = PageRepairFragmentation(page, unused);

	/* XLOG stuff */
	if (!onerel->rd_istemp)
	{
		XLogRecPtr	recptr;

		recptr = log_heap_clean(onerel, buffer, (char *) unused,
						  (char *) (&(unused[uncnt])) - (char *) unused);
		PageSetLSN(page, recptr);
		PageSetSUI(page, ThisStartUpID);
	}
	else
	{
		/* No XLOG record, but still need to flag that XID exists on disk */
		MyXactMadeTempRelUpdate = true;
	}

	END_CRIT_SECTION();
}

/*
 *	scan_index() -- scan one index relation to update statistics.
 *
 * We use this when we have no deletions to do.
 */
static void
scan_index(Relation indrel, double num_tuples)
{
	IndexBulkDeleteResult *stats;
	VacRUsage	ru0;

	vac_init_rusage(&ru0);

	/*
	 * Even though we're not planning to delete anything, use the
	 * ambulkdelete call, so that the scan happens within the index AM for
	 * more speed.
	 */
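	/*
	 * dummy_tid_reaped (below) always reports "not reaped", so this call
	 * deletes nothing; it is purely a statistics-gathering scan.
	 */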
	stats = index_bulk_delete(indrel, dummy_tid_reaped, NULL);

	if (!stats)
		return;

	/* now update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(indrel),
						stats->num_pages, stats->num_index_tuples,
						false);

	elog(elevel, "Index %s: Pages %u; Tuples %.0f.\n\t%s",
		 RelationGetRelationName(indrel),
		 stats->num_pages, stats->num_index_tuples,
		 vac_show_rusage(&ru0));

	/*
	 * Check for tuple count mismatch.	If the index is partial, then it's
	 * OK for it to have fewer tuples than the heap; else we got trouble.
	 */
	if (stats->num_index_tuples != num_tuples)
	{
		if (stats->num_index_tuples > num_tuples ||
			!vac_is_partial_index(indrel))
			elog(WARNING, "Index %s: NUMBER OF INDEX' TUPLES (%.0f) IS NOT THE SAME AS HEAP' (%.0f).\
\n\tRecreate the index.",
				 RelationGetRelationName(indrel),
				 stats->num_index_tuples, num_tuples);
	}

	pfree(stats);
}

/*
 *	vacuum_index() -- vacuum one index relation.
 *
 *		vacpagelist is the VacPageList of the heap we're currently
 *		vacuuming.  It's locked.  indrel is an index relation on the
 *		vacuumed heap.
 *
 *		We don't bother to set locks on the index relation here, since
 *		the parent table is exclusive-locked already.
 *
 *		Finally, we arrange to update the index relation's statistics in
 *		pg_class.
 */
static void
vacuum_index(VacPageList vacpagelist, Relation indrel,
			 double num_tuples, int keep_tuples)
{
	IndexBulkDeleteResult *stats;
	VacRUsage	ru0;

	vac_init_rusage(&ru0);

	/* Do bulk deletion */
	stats = index_bulk_delete(indrel, tid_reaped, (void *) vacpagelist);

	if (!stats)
		return;

	/* now update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(indrel),
						stats->num_pages, stats->num_index_tuples,
						false);

	elog(elevel, "Index %s: Pages %u; Tuples %.0f: Deleted %.0f.\n\t%s",
		 RelationGetRelationName(indrel), stats->num_pages,
		 stats->num_index_tuples - keep_tuples, stats->tuples_removed,
		 vac_show_rusage(&ru0));

	/*
	 * Check for tuple count mismatch.	If the index is partial, then it's
	 * OK for it to have fewer tuples than the heap; else we got trouble.
	 */
	if (stats->num_index_tuples != num_tuples + keep_tuples)
	{
		if (stats->num_index_tuples > num_tuples + keep_tuples ||
			!vac_is_partial_index(indrel))
			elog(WARNING, "Index %s: NUMBER OF INDEX' TUPLES (%.0f) IS NOT THE SAME AS HEAP' (%.0f).\
\n\tRecreate the index.",
				 RelationGetRelationName(indrel),
				 stats->num_index_tuples, num_tuples);
	}

	pfree(stats);
}

/*
 *	tid_reaped() -- is a particular tid reaped?
 *
 *		This has the right signature to be an IndexBulkDeleteCallback.
 *
 *		vacpagelist->VacPage_array is sorted in the right order.
 */
static bool
tid_reaped(ItemPointer itemptr, void *state)
{
	VacPageList vacpagelist = (VacPageList) state;
	OffsetNumber ioffno;
	OffsetNumber *voff;
	VacPage		vp,
			   *vpp;
	VacPageData vacpage;

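	/*
	 * This is a two-level binary search: first find the TID's block in
	 * the sorted page list, then find its offset in that page's sorted
	 * offsets array.
	 */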
	vacpage.blkno = ItemPointerGetBlockNumber(itemptr);
	ioffno = ItemPointerGetOffsetNumber(itemptr);

	vp = &vacpage;
	vpp = (VacPage *) vac_bsearch((void *) &vp,
								  (void *) (vacpagelist->pagedesc),
								  vacpagelist->num_pages,
								  sizeof(VacPage),
								  vac_cmp_blk);

	if (vpp == NULL)
		return false;

	/* ok - we are on a partially or fully reaped page */
	vp = *vpp;

	if (vp->offsets_free == 0)
	{
		/* this is EmptyPage, so claim all tuples on it are reaped!!! */
		return true;
	}

	voff = (OffsetNumber *) vac_bsearch((void *) &ioffno,
										(void *) (vp->offsets),
										vp->offsets_free,
										sizeof(OffsetNumber),
										vac_cmp_offno);

	if (voff == NULL)
		return false;

	/* tid is reaped */
	return true;
}

/*
 * Dummy version for scan_index.
 */
static bool
dummy_tid_reaped(ItemPointer itemptr, void *state)
{
	return false;
}

/*
 * Update the shared Free Space Map with the info we now have about
 * free space in the relation, discarding any old info the map may have.
 */
static void
vac_update_fsm(Relation onerel, VacPageList fraged_pages,
			   BlockNumber rel_pages)
{
	int			nPages = fraged_pages->num_pages;
	int			i;
	BlockNumber *pages;
	Size	   *spaceAvail;

	/* +1 to avoid palloc(0) */
	pages = (BlockNumber *) palloc((nPages + 1) * sizeof(BlockNumber));
	spaceAvail = (Size *) palloc((nPages + 1) * sizeof(Size));

	for (i = 0; i < nPages; i++)
	{
		pages[i] = fraged_pages->pagedesc[i]->blkno;
		spaceAvail[i] = fraged_pages->pagedesc[i]->free;

		/*
		 * fraged_pages may contain entries for pages that we later
		 * decided to truncate from the relation; don't enter them into
		 * the map!
		 */
		if (pages[i] >= rel_pages)
		{
			nPages = i;
			break;
		}
	}

	MultiRecordFreeSpace(&onerel->rd_node,
						 0, MaxBlockNumber,
						 nPages, pages, spaceAvail);
	pfree(pages);
	pfree(spaceAvail);
}

/* Copy a VacPage structure */
static VacPage
copy_vac_page(VacPage vacpage)
{
	VacPage		newvacpage;

	/* allocate a VacPageData entry */
	newvacpage = (VacPage) palloc(sizeof(VacPageData) +
						   vacpage->offsets_free * sizeof(OffsetNumber));

	/* fill it in */
	if (vacpage->offsets_free > 0)
		memcpy(newvacpage->offsets, vacpage->offsets,
			   vacpage->offsets_free * sizeof(OffsetNumber));
	newvacpage->blkno = vacpage->blkno;
	newvacpage->free = vacpage->free;
	newvacpage->offsets_used = vacpage->offsets_used;
	newvacpage->offsets_free = vacpage->offsets_free;

	return newvacpage;
}

/*
 * Add a VacPage pointer to a VacPageList.
 *
 *		As a side effect of the way that scan_heap works,
 *		higher pages come after lower pages in the array
 *		(and highest tid on a page is last).
 */
static void
vpage_insert(VacPageList vacpagelist, VacPage vpnew)
{
#define PG_NPAGEDESC 1024

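	/*
	 * The pagedesc array starts at PG_NPAGEDESC entries and doubles
	 * whenever it fills, so appending is amortized constant time.
	 */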
	/* allocate a VacPage entry if needed */
	if (vacpagelist->num_pages == 0)
	{
		vacpagelist->pagedesc = (VacPage *) palloc(PG_NPAGEDESC * sizeof(VacPage));
		vacpagelist->num_allocated_pages = PG_NPAGEDESC;
	}
	else if (vacpagelist->num_pages >= vacpagelist->num_allocated_pages)
	{
		vacpagelist->num_allocated_pages *= 2;
		vacpagelist->pagedesc = (VacPage *) repalloc(vacpagelist->pagedesc, vacpagelist->num_allocated_pages * sizeof(VacPage));
	}
	vacpagelist->pagedesc[vacpagelist->num_pages] = vpnew;
	(vacpagelist->num_pages)++;
}

/*
 * vac_bsearch: just like standard C library routine bsearch(),
 * except that we first test to see whether the target key is outside
 * the range of the table entries.	This case is handled relatively slowly
 * by the normal binary search algorithm (ie, no faster than any other key)
 * but it occurs often enough in VACUUM to be worth optimizing.
 */
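/*
 * Many probes fall below the first or above the last entry (e.g. TIDs on
 * pages that were never reaped), so the endpoint comparisons below
 * dispose of such lookups in O(1) before falling back to bsearch().
 */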
static void *
vac_bsearch(const void *key, const void *base,
			size_t nelem, size_t size,
			int (*compar) (const void *, const void *))
{
	int			res;
	const void *last;

	if (nelem == 0)
		return NULL;
	res = compar(key, base);
	if (res < 0)
		return NULL;
	if (res == 0)
		return (void *) base;
	if (nelem > 1)
	{
		last = (const void *) ((const char *) base + (nelem - 1) * size);
		res = compar(key, last);
		if (res > 0)
			return NULL;
		if (res == 0)
			return (void *) last;
	}
	if (nelem <= 2)
		return NULL;			/* already checked 'em all */
	return bsearch(key, base, nelem, size, compar);
}

/*
 * Comparator routines for use with qsort() and bsearch().
 */
static int
vac_cmp_blk(const void *left, const void *right)
{
	BlockNumber lblk,
				rblk;

	lblk = (*((VacPage *) left))->blkno;
	rblk = (*((VacPage *) right))->blkno;

	if (lblk < rblk)
		return -1;
	if (lblk == rblk)
		return 0;
	return 1;
}

static int
vac_cmp_offno(const void *left, const void *right)
{
	if (*(OffsetNumber *) left < *(OffsetNumber *) right)
		return -1;
	if (*(OffsetNumber *) left == *(OffsetNumber *) right)
		return 0;
	return 1;
}

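/*
 * vac_cmp_vtlinks orders VTupleLinkData entries by new_tid: block number
 * (bi_hi, then bi_lo), then offset (ip_posid).  repair_frag relies on
 * this ordering to look up a tuple's parent version by binary-searching
 * on the child's t_self.
 */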
static int
vac_cmp_vtlinks(const void *left, const void *right)
{
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi <
		((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
		return -1;
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi >
		((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
		return 1;
	/* bi_hi-es are equal */
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo <
		((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
		return -1;
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo >
		((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
		return 1;
	/* bi_lo-es are equal */
	if (((VTupleLink) left)->new_tid.ip_posid <
		((VTupleLink) right)->new_tid.ip_posid)
		return -1;
	if (((VTupleLink) left)->new_tid.ip_posid >
		((VTupleLink) right)->new_tid.ip_posid)
		return 1;
	return 0;
}


void
vac_open_indexes(Relation relation, int *nindexes, Relation **Irel)
{
	List	   *indexoidlist,
			   *indexoidscan;
	int			i;

	indexoidlist = RelationGetIndexList(relation);

	*nindexes = length(indexoidlist);

	if (*nindexes > 0)
		*Irel = (Relation *) palloc(*nindexes * sizeof(Relation));
	else
		*Irel = NULL;

	i = 0;
	foreach(indexoidscan, indexoidlist)
	{
		Oid			indexoid = lfirsti(indexoidscan);

		(*Irel)[i] = index_open(indexoid);
		i++;
	}

	freeList(indexoidlist);
}


void
vac_close_indexes(int nindexes, Relation *Irel)
{
	if (Irel == (Relation *) NULL)
		return;

	while (nindexes--)
		index_close(Irel[nindexes]);
	pfree(Irel);
}


/*
 * Is an index partial (ie, could it contain fewer tuples than the heap?)
 */
bool
vac_is_partial_index(Relation indrel)
{
	/*
	 * If the index's AM doesn't support nulls, it's partial for our
	 * purposes
	 */
	if (!indrel->rd_am->amindexnulls)
		return true;

	/* Otherwise, look to see if there's a partial-index predicate */
	return (VARSIZE(&indrel->rd_index->indpred) > VARHDRSZ);
}


static bool
enough_space(VacPage vacpage, Size len)
{
	len = MAXALIGN(len);

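	/*
	 * A tuple can fit in two ways: by reusing one of the line pointers
	 * being freed on this page, or, if those are all claimed already, by
	 * paying additionally for a brand-new ItemIdData.
	 */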
	if (len > vacpage->free)
		return false;

	/* if there are free itemid(s) and len <= free_space... */
	if (vacpage->offsets_used < vacpage->offsets_free)
		return true;

	/* noff_used >= noff_free and so we'll have to allocate new itemid */
	if (len + sizeof(ItemIdData) <= vacpage->free)
		return true;

	return false;
}


/*
 * Initialize usage snapshot.
 */
void
vac_init_rusage(VacRUsage *ru0)
{
	struct timezone tz;

	getrusage(RUSAGE_SELF, &ru0->ru);
	gettimeofday(&ru0->tv, &tz);
}

/*
 * Compute elapsed time since ru0 usage snapshot, and format into
 * a displayable string.  Result is in a static string, which is
 * tacky, but no one ever claimed that the Postgres backend is
 * threadable...
 */
const char *
vac_show_rusage(VacRUsage *ru0)
{
	static char result[100];
	VacRUsage	ru1;

	vac_init_rusage(&ru1);

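	/*
	 * Classic struct-timeval subtraction: whenever a microseconds field
	 * would go negative, borrow one second (1000000 usec) from the
	 * corresponding seconds field before the subtractions in the
	 * snprintf below.
	 */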
	if (ru1.tv.tv_usec < ru0->tv.tv_usec)
	{
		ru1.tv.tv_sec--;
		ru1.tv.tv_usec += 1000000;
	}
	if (ru1.ru.ru_stime.tv_usec < ru0->ru.ru_stime.tv_usec)
	{
		ru1.ru.ru_stime.tv_sec--;
		ru1.ru.ru_stime.tv_usec += 1000000;
	}
	if (ru1.ru.ru_utime.tv_usec < ru0->ru.ru_utime.tv_usec)
	{
		ru1.ru.ru_utime.tv_sec--;
		ru1.ru.ru_utime.tv_usec += 1000000;
	}

	snprintf(result, sizeof(result),
			 "CPU %d.%02ds/%d.%02du sec elapsed %d.%02d sec.",
			 (int) (ru1.ru.ru_stime.tv_sec - ru0->ru.ru_stime.tv_sec),
	  (int) (ru1.ru.ru_stime.tv_usec - ru0->ru.ru_stime.tv_usec) / 10000,
			 (int) (ru1.ru.ru_utime.tv_sec - ru0->ru.ru_utime.tv_sec),
	  (int) (ru1.ru.ru_utime.tv_usec - ru0->ru.ru_utime.tv_usec) / 10000,
			 (int) (ru1.tv.tv_sec - ru0->tv.tv_sec),
			 (int) (ru1.tv.tv_usec - ru0->tv.tv_usec) / 10000);

	return result;
}