/*-------------------------------------------------------------------------
 *
 * vacuum.c
 *	  The postgres vacuum cleaner.
 *
 * This file includes the "full" version of VACUUM, as well as control code
 * used by all three of full VACUUM, lazy VACUUM, and ANALYZE.	See
 * vacuumlazy.c and analyze.c for the rest of the code for the latter two.
 *
 *
 * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.252 2003/05/02 20:54:33 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <unistd.h>

#include "access/clog.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "catalog/catname.h"
#include "catalog/namespace.h"
#include "catalog/pg_database.h"
#include "catalog/pg_index.h"
#include "commands/vacuum.h"
#include "executor/executor.h"
#include "miscadmin.h"
#include "storage/freespace.h"
#include "storage/sinval.h"
#include "storage/smgr.h"
#include "tcop/pquery.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/relcache.h"
#include "utils/syscache.h"
#include "pgstat.h"


typedef struct VacPageData
{
	BlockNumber blkno;			/* BlockNumber of this Page */
	Size		free;			/* FreeSpace on this Page */
	uint16		offsets_used;	/* Number of OffNums used by vacuum */
	uint16		offsets_free;	/* Number of OffNums free or to be freed */
	OffsetNumber offsets[1];	/* Array of free OffNums */
} VacPageData;

typedef VacPageData *VacPage;

typedef struct VacPageListData
{
	BlockNumber empty_end_pages;	/* Number of "empty" end-pages */
	int			num_pages;		/* Number of pages in pagedesc */
	int			num_allocated_pages;	/* Number of allocated pages in
										 * pagedesc */
	VacPage    *pagedesc;		/* Descriptions of pages */
} VacPageListData;

typedef VacPageListData *VacPageList;

typedef struct VTupleLinkData
{
	ItemPointerData new_tid;
	ItemPointerData this_tid;
} VTupleLinkData;

typedef VTupleLinkData *VTupleLink;

typedef struct VTupleMoveData
{
	ItemPointerData tid;		/* tuple ID */
	VacPage		vacpage;		/* where to move */
	bool		cleanVpd;		/* clean vacpage before using */
} VTupleMoveData;

typedef VTupleMoveData *VTupleMove;

typedef struct VRelStats
{
	BlockNumber rel_pages;
	double		rel_tuples;
	Size		min_tlen;
	Size		max_tlen;
	bool		hasindex;
	int			num_vtlinks;
	VTupleLink	vtlinks;
} VRelStats;
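
/*
 * Orientation note (inferred from the code below): scan_heap() fills in one
 * VacPage per interesting heap page and appends it, in blkno order, to the
 * vacuum_pages and/or fraged_pages lists; repair_frag() and vacuum_heap()
 * rely on that blkno ordering.
 */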


static MemoryContext vac_context = NULL;

static int	elevel = -1;

static TransactionId OldestXmin;
static TransactionId FreezeLimit;


/* non-export function prototypes */
static List *getrels(const RangeVar *vacrel, const char *stmttype);
static void vac_update_dbstats(Oid dbid,
				   TransactionId vacuumXID,
				   TransactionId frozenXID);
static void vac_truncate_clog(TransactionId vacuumXID,
				  TransactionId frozenXID);
static bool vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind);
static void full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt);
static void scan_heap(VRelStats *vacrelstats, Relation onerel,
		  VacPageList vacuum_pages, VacPageList fraged_pages);
static void repair_frag(VRelStats *vacrelstats, Relation onerel,
			VacPageList vacuum_pages, VacPageList fraged_pages,
			int nindexes, Relation *Irel);
static void vacuum_heap(VRelStats *vacrelstats, Relation onerel,
			VacPageList vacpagelist);
static void vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage);
static void vacuum_index(VacPageList vacpagelist, Relation indrel,
			 double num_tuples, int keep_tuples);
static void scan_index(Relation indrel, double num_tuples);
static bool tid_reaped(ItemPointer itemptr, void *state);
static bool dummy_tid_reaped(ItemPointer itemptr, void *state);
static void vac_update_fsm(Relation onerel, VacPageList fraged_pages,
			   BlockNumber rel_pages);
static VacPage copy_vac_page(VacPage vacpage);
static void vpage_insert(VacPageList vacpagelist, VacPage vpnew);
static void *vac_bsearch(const void *key, const void *base,
			size_t nelem, size_t size,
			int (*compar) (const void *, const void *));
static int	vac_cmp_blk(const void *left, const void *right);
static int	vac_cmp_offno(const void *left, const void *right);
static int	vac_cmp_vtlinks(const void *left, const void *right);
static bool enough_space(VacPage vacpage, Size len);


/****************************************************************************
 *																			*
 *			Code common to all flavors of VACUUM and ANALYZE				*
 *																			*
 ****************************************************************************
 */


/*
 * Primary entry point for VACUUM and ANALYZE commands.
 */
void
vacuum(VacuumStmt *vacstmt)
{
	const char *stmttype = vacstmt->vacuum ? "VACUUM" : "ANALYZE";
	MemoryContext anl_context = NULL;
	TransactionId initialOldestXmin = InvalidTransactionId;
	TransactionId initialFreezeLimit = InvalidTransactionId;
	bool		all_rels;
	List	   *vrl,
			   *cur;

	if (vacstmt->verbose)
		elevel = INFO;
	else
		elevel = DEBUG1;

	/*
	 * We cannot run VACUUM inside a user transaction block; if we were
	 * inside a transaction, then our commit- and
	 * start-transaction-command calls would not have the intended effect!
	 * Furthermore, the forced commit that occurs before truncating the
	 * relation's file would have the effect of committing the rest of the
	 * user's transaction too, which would certainly not be the desired
	 * behavior.
	 */
	if (vacstmt->vacuum)
		PreventTransactionChain((void *) vacstmt, stmttype);
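
	/*
	 * For example (illustrative), an explicit transaction block such as
	 *
	 *		BEGIN;
	 *		VACUUM;
	 *
	 * is rejected by PreventTransactionChain() before any per-table work
	 * begins.
	 */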

	/*
	 * Send info about dead objects to the statistics collector
	 */
	if (vacstmt->vacuum)
		pgstat_vacuum_tabstat();

	/*
	 * Create special memory context for cross-transaction storage.
	 *
	 * Since it is a child of PortalContext, it will go away eventually even
	 * if we suffer an error; there's no need for special abort cleanup
	 * logic.
	 */
	vac_context = AllocSetContextCreate(PortalContext,
										"Vacuum",
										ALLOCSET_DEFAULT_MINSIZE,
										ALLOCSET_DEFAULT_INITSIZE,
										ALLOCSET_DEFAULT_MAXSIZE);

	/*
	 * If we are running only ANALYZE, we don't need per-table
	 * transactions, but we still need a memory context with table
	 * lifetime.
	 */
	if (vacstmt->analyze && !vacstmt->vacuum)
		anl_context = AllocSetContextCreate(PortalContext,
											"Analyze",
											ALLOCSET_DEFAULT_MINSIZE,
											ALLOCSET_DEFAULT_INITSIZE,
											ALLOCSET_DEFAULT_MAXSIZE);

	/* Assume we are processing everything unless one table is mentioned */
	all_rels = (vacstmt->relation == NULL);

	/* Build list of relations to process (note this lives in vac_context) */
	vrl = getrels(vacstmt->relation, stmttype);

	/*
	 * Formerly, there was code here to prevent more than one VACUUM from
	 * executing concurrently in the same database.  However, there's no
	 * good reason to prevent that, and manually removing lockfiles after
	 * a vacuum crash was a pain for dbadmins.	So, forget about
	 * lockfiles, and just rely on the locks we grab on each target table
	 * to ensure that there aren't two VACUUMs running on the same table
	 * at the same time.
	 */

	/*
	 * The strangeness with committing and starting transactions here is
	 * due to wanting to run each table's VACUUM as a separate
	 * transaction, so that we don't hold locks unnecessarily long.  Also,
	 * if we are doing VACUUM ANALYZE, the ANALYZE part runs as a separate
	 * transaction from the VACUUM to further reduce locking.
	 *
	 * vacuum_rel expects to be entered with no transaction active; it will
	 * start and commit its own transaction.  But we are called by an SQL
	 * command, and so we are executing inside a transaction already.  We
	 * commit the transaction started in PostgresMain() here, and start
	 * another one before exiting to match the commit waiting for us back
	 * in PostgresMain().
	 *
	 * In the case of an ANALYZE statement (no vacuum, just analyze) it's
	 * okay to run the whole thing in the outer transaction, and so we
	 * skip transaction start/stop operations.
	 */
	if (vacstmt->vacuum)
	{
		if (all_rels)
		{
			/*
			 * It's a database-wide VACUUM.
			 *
			 * Compute the initially applicable OldestXmin and FreezeLimit
			 * XIDs, so that we can record these values at the end of the
			 * VACUUM. Note that individual tables may well be processed
			 * with newer values, but we can guarantee that no
			 * (non-shared) relations are processed with older ones.
			 *
			 * It is okay to record non-shared values in pg_database, even
			 * though we may vacuum shared relations with older cutoffs,
			 * because only the minimum of the values present in
			 * pg_database matters.  We can be sure that shared relations
			 * have at some time been vacuumed with cutoffs no worse than
			 * the global minimum; for, if there is a backend in some
			 * other DB with xmin = OLDXMIN that's determining the cutoff
			 * with which we vacuum shared relations, it is not possible
			 * for that database to have a cutoff newer than OLDXMIN
			 * recorded in pg_database.
			 */
			vacuum_set_xid_limits(vacstmt, false,
								  &initialOldestXmin,
								  &initialFreezeLimit);
		}

		/* matches the StartTransaction in PostgresMain() */
		CommitTransactionCommand(true);
	}

	/*
	 * Loop to process each selected relation.
	 */
	foreach(cur, vrl)
	{
		Oid			relid = lfirsto(cur);

		if (vacstmt->vacuum)
		{
			if (! vacuum_rel(relid, vacstmt, RELKIND_RELATION))
				all_rels = false; /* forget about updating dbstats */
		}
		if (vacstmt->analyze)
		{
			MemoryContext old_context = NULL;

			/*
			 * If we vacuumed, use new transaction for analyze.
			 * Otherwise, we can use the outer transaction, but we still
			 * need to call analyze_rel in a memory context that will be
			 * cleaned up on return (else we leak memory while processing
			 * multiple tables).
			 */
			if (vacstmt->vacuum)
			{
				StartTransactionCommand(true);
				SetQuerySnapshot();	/* might be needed for functional index */
			}
			else
				old_context = MemoryContextSwitchTo(anl_context);

			analyze_rel(relid, vacstmt);

			if (vacstmt->vacuum)
				CommitTransactionCommand(true);
			else
			{
				MemoryContextSwitchTo(old_context);
				MemoryContextResetAndDeleteChildren(anl_context);
			}
		}
	}

	/*
	 * Finish up processing.
	 */
	if (vacstmt->vacuum)
	{
		/* here, we are not in a transaction */

		/*
		 * This matches the CommitTransaction waiting for us in
		 * PostgresMain(). We tell xact.c not to chain the upcoming
		 * commit, so that a VACUUM doesn't start a transaction block,
		 * even when autocommit is off.
		 */
		StartTransactionCommand(true);

		/*
		 * If it was a database-wide VACUUM, print FSM usage statistics
		 * (we don't make you be superuser to see these).
		 */
		if (vacstmt->relation == NULL)
			PrintFreeSpaceMapStatistics(elevel);

		/*
		 * If we completed a database-wide VACUUM without skipping any
		 * relations, update the database's pg_database row with info
		 * about the transaction IDs used, and try to truncate pg_clog.
		 */
		if (all_rels)
		{
			vac_update_dbstats(MyDatabaseId,
							   initialOldestXmin, initialFreezeLimit);
			vac_truncate_clog(initialOldestXmin, initialFreezeLimit);
		}
	}

	/*
	 * Clean up working storage --- note we must do this after
	 * StartTransactionCommand, else we might be trying to delete the
	 * active context!
	 */
	MemoryContextDelete(vac_context);
	vac_context = NULL;

	if (anl_context)
		MemoryContextDelete(anl_context);
}

/*
 * Build a list of Oids for each relation to be processed
 *
 * The list is built in vac_context so that it will survive across our
 * per-relation transactions.
 */
static List *
getrels(const RangeVar *vacrel, const char *stmttype)
{
	List	   *vrl = NIL;
	MemoryContext oldcontext;

	if (vacrel)
	{
		/* Process specific relation */
		Oid			relid;

		relid = RangeVarGetRelid(vacrel, false);

		/* Make a relation list entry for this guy */
		oldcontext = MemoryContextSwitchTo(vac_context);
		vrl = lappendo(vrl, relid);
		MemoryContextSwitchTo(oldcontext);
	}
	else
	{
		/* Process all plain relations listed in pg_class */
		Relation	pgclass;
		HeapScanDesc scan;
		HeapTuple	tuple;
		ScanKeyData key;

		ScanKeyEntryInitialize(&key, 0x0,
							   Anum_pg_class_relkind,
							   F_CHAREQ,
							   CharGetDatum(RELKIND_RELATION));

		pgclass = heap_openr(RelationRelationName, AccessShareLock);

		scan = heap_beginscan(pgclass, SnapshotNow, 1, &key);

		while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
		{
			/* Make a relation list entry for this guy */
			oldcontext = MemoryContextSwitchTo(vac_context);
			vrl = lappendo(vrl, HeapTupleGetOid(tuple));
			MemoryContextSwitchTo(oldcontext);
		}

		heap_endscan(scan);
		heap_close(pgclass, AccessShareLock);
	}

	return vrl;
}
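
/*
 * A sketch of the allocation pattern used above (and throughout this file)
 * to make data survive the per-relation transactions:
 *
 *		oldcontext = MemoryContextSwitchTo(vac_context);
 *		...allocations now live until vacuum() deletes vac_context...
 *		MemoryContextSwitchTo(oldcontext);
 */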

/*
 * vacuum_set_xid_limits() -- compute oldest-Xmin and freeze cutoff points
 */
void
vacuum_set_xid_limits(VacuumStmt *vacstmt, bool sharedRel,
					  TransactionId *oldestXmin,
					  TransactionId *freezeLimit)
{
	TransactionId limit;

	*oldestXmin = GetOldestXmin(sharedRel);

	Assert(TransactionIdIsNormal(*oldestXmin));

	if (vacstmt->freeze)
	{
		/* FREEZE option: use oldest Xmin as freeze cutoff too */
		limit = *oldestXmin;
	}
	else
	{
		/*
		 * Normal case: freeze cutoff is well in the past, to wit, about
		 * halfway to the wrap horizon
		 */
		limit = GetCurrentTransactionId() - (MaxTransactionId >> 2);
	}

	/*
	 * Be careful not to generate a "permanent" XID
	 */
	if (!TransactionIdIsNormal(limit))
		limit = FirstNormalTransactionId;

	/*
	 * Ensure sane relationship of limits
	 */
	if (TransactionIdFollows(limit, *oldestXmin))
	{
		elog(WARNING, "oldest Xmin is far in the past --- close open transactions soon to avoid wraparound problems");
		limit = *oldestXmin;
	}

	*freezeLimit = limit;
}
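
/*
 * Worked example (illustrative numbers): with 32-bit XIDs,
 * MaxTransactionId >> 2 is 0x3FFFFFFF, about one billion.  For a current
 * XID of 3000000000 the normal-case cutoff computed above is
 * 3000000000 - 1073741823 = 1926258177; only tuples whose committed xmin
 * precedes that get frozen, unless FREEZE raises the cutoff to OldestXmin.
 */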


/*
 *	vac_update_relstats() -- update statistics for one relation
 *
 *		Update the whole-relation statistics that are kept in its pg_class
 *		row.  There are additional stats that will be updated if we are
 *		doing ANALYZE, but we always update these stats.  This routine works
 *		for both index and heap relation entries in pg_class.
 *
 *		We violate no-overwrite semantics here by storing new values for the
 *		statistics columns directly into the pg_class tuple that's already on
 *		the page.  The reason for this is that if we updated these tuples in
 *		the usual way, vacuuming pg_class itself wouldn't work very well ---
 *		by the time we got done with a vacuum cycle, most of the tuples in
 *		pg_class would've been obsoleted.  Of course, this only works for
 *		fixed-size never-null columns, but these are.
 *
 *		This routine is shared by full VACUUM, lazy VACUUM, and stand-alone
 *		ANALYZE.
 */
void
vac_update_relstats(Oid relid, BlockNumber num_pages, double num_tuples,
					bool hasindex)
{
	Relation	rd;
	HeapTupleData rtup;
	HeapTuple	ctup;
	Form_pg_class pgcform;
	Buffer		buffer;

	/*
	 * update number of tuples and number of pages in pg_class
	 */
	rd = heap_openr(RelationRelationName, RowExclusiveLock);

	ctup = SearchSysCache(RELOID,
						  ObjectIdGetDatum(relid),
						  0, 0, 0);
	if (!HeapTupleIsValid(ctup))
		elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
			 relid);

	/* get the buffer cache tuple */
	rtup.t_self = ctup->t_self;
	ReleaseSysCache(ctup);
	if (!heap_fetch(rd, SnapshotNow, &rtup, &buffer, false, NULL))
		elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
			 relid);

	/* overwrite the existing statistics in the tuple */
	pgcform = (Form_pg_class) GETSTRUCT(&rtup);
	pgcform->relpages = (int32) num_pages;
	pgcform->reltuples = num_tuples;
	pgcform->relhasindex = hasindex;

	/*
	 * If we have discovered that there are no indexes, then there's no
	 * primary key either.	This could be done more thoroughly...
	 */
	if (!hasindex)
		pgcform->relhaspkey = false;

	/*
	 * Invalidate the tuple in the catcaches; this also arranges to flush
	 * the relation's relcache entry.  (If we fail to commit for some
	 * reason, no flush will occur, but no great harm is done since there
	 * are no noncritical state updates here.)
	 */
	CacheInvalidateHeapTuple(rd, &rtup);

	/* Write the buffer */
	WriteBuffer(buffer);

	heap_close(rd, RowExclusiveLock);
}


/*
 *	vac_update_dbstats() -- update statistics for one database
 *
 *		Update the whole-database statistics that are kept in its pg_database
 *		row.
 *
 *		We violate no-overwrite semantics here by storing new values for the
 *		statistics columns directly into the tuple that's already on the page.
 *		As with vac_update_relstats, this avoids leaving dead tuples behind
 *		after a VACUUM; which is good since GetRawDatabaseInfo
 *		can get confused by finding dead tuples in pg_database.
 *
 *		This routine is shared by full and lazy VACUUM.  Note that it is only
 *		applied after a database-wide VACUUM operation.
 */
static void
vac_update_dbstats(Oid dbid,
				   TransactionId vacuumXID,
				   TransactionId frozenXID)
{
	Relation	relation;
	ScanKeyData entry[1];
	HeapScanDesc scan;
	HeapTuple	tuple;
	Form_pg_database dbform;

	relation = heap_openr(DatabaseRelationName, RowExclusiveLock);

	/* Must use a heap scan, since there's no syscache for pg_database */
	ScanKeyEntryInitialize(&entry[0], 0x0,
						   ObjectIdAttributeNumber, F_OIDEQ,
						   ObjectIdGetDatum(dbid));

	scan = heap_beginscan(relation, SnapshotNow, 1, entry);

	tuple = heap_getnext(scan, ForwardScanDirection);

	if (!HeapTupleIsValid(tuple))
		elog(ERROR, "database %u does not exist", dbid);

	dbform = (Form_pg_database) GETSTRUCT(tuple);

	/* overwrite the existing statistics in the tuple */
	dbform->datvacuumxid = vacuumXID;
	dbform->datfrozenxid = frozenXID;

	/* invalidate the tuple in the cache and write the buffer */
	CacheInvalidateHeapTuple(relation, tuple);
	WriteNoReleaseBuffer(scan->rs_cbuf);

	heap_endscan(scan);

	heap_close(relation, RowExclusiveLock);
}


/*
 *	vac_truncate_clog() -- attempt to truncate the commit log
 *
 *		Scan pg_database to determine the system-wide oldest datvacuumxid,
 *		and use it to truncate the transaction commit log (pg_clog).
 *		Also generate a warning if the system-wide oldest datfrozenxid
 *		seems to be in danger of wrapping around.
 *
 *		The passed XIDs are simply the ones I just wrote into my pg_database
 *		entry.	They're used to initialize the "min" calculations.
 *
 *		This routine is shared by full and lazy VACUUM.  Note that it is only
 *		applied after a database-wide VACUUM operation.
 */
static void
vac_truncate_clog(TransactionId vacuumXID, TransactionId frozenXID)
{
	TransactionId myXID;
	Relation	relation;
	HeapScanDesc scan;
	HeapTuple	tuple;
	int32		age;
	bool		vacuumAlreadyWrapped = false;
	bool		frozenAlreadyWrapped = false;

	myXID = GetCurrentTransactionId();

	relation = heap_openr(DatabaseRelationName, AccessShareLock);

	scan = heap_beginscan(relation, SnapshotNow, 0, NULL);

	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		Form_pg_database dbform = (Form_pg_database) GETSTRUCT(tuple);

		/* Ignore non-connectable databases (eg, template0) */
		/* It's assumed that these have been frozen correctly */
		if (!dbform->datallowconn)
			continue;

		if (TransactionIdIsNormal(dbform->datvacuumxid))
		{
			if (TransactionIdPrecedes(myXID, dbform->datvacuumxid))
				vacuumAlreadyWrapped = true;
			else if (TransactionIdPrecedes(dbform->datvacuumxid, vacuumXID))
				vacuumXID = dbform->datvacuumxid;
		}
		if (TransactionIdIsNormal(dbform->datfrozenxid))
		{
			if (TransactionIdPrecedes(myXID, dbform->datfrozenxid))
				frozenAlreadyWrapped = true;
			else if (TransactionIdPrecedes(dbform->datfrozenxid, frozenXID))
				frozenXID = dbform->datfrozenxid;
		}
	}

	heap_endscan(scan);

	heap_close(relation, AccessShareLock);
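
	/*
	 * (Note on the comparisons above: TransactionIdPrecedes() works in
	 * circular modulo-2^32 arithmetic, so myXID "preceding" a database's
	 * stamped XID can only mean that the stamp is over two billion
	 * transactions in the past --- i.e. wraparound has already occurred.)
	 */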

	/*
	 * Do not truncate CLOG if we seem to have suffered wraparound
	 * already; the computed minimum XID might be bogus.
	 */
	if (vacuumAlreadyWrapped)
	{
		elog(WARNING, "Some databases have not been vacuumed in over 2 billion transactions."
			 "\n\tYou may have already suffered transaction-wraparound data loss.");
		return;
	}

	/* Truncate CLOG to the oldest vacuumxid */
	TruncateCLOG(vacuumXID);

	/* Give warning about impending wraparound problems */
	if (frozenAlreadyWrapped)
	{
		elog(WARNING, "Some databases have not been vacuumed in over 1 billion transactions."
			 "\n\tBetter vacuum them soon, or you may have a wraparound failure.");
	}
	else
	{
		age = (int32) (myXID - frozenXID);
		if (age > (int32) ((MaxTransactionId >> 3) * 3))
			elog(WARNING, "Some databases have not been vacuumed in %d transactions."
				 "\n\tBetter vacuum them within %d transactions,"
				 "\n\tor you may have a wraparound failure.",
				 age, (int32) (MaxTransactionId >> 1) - age);
	}
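
	/*
	 * Illustrative arithmetic: (MaxTransactionId >> 3) * 3 is about 1.6
	 * billion, so the warning above fires once the oldest datfrozenxid is
	 * that far behind, leaving roughly (MaxTransactionId >> 1) - age, here
	 * ~0.5 billion transactions, before the 2-billion wrap horizon.
	 */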
}


/****************************************************************************
 *																			*
 *			Code common to both flavors of VACUUM							*
 *																			*
 ****************************************************************************
 */


/*
 *	vacuum_rel() -- vacuum one heap relation
 *
 *		Returns TRUE if we actually processed the relation (or can ignore it
 *		for some reason), FALSE if we failed to process it due to permissions
 *		or other reasons.  (A FALSE result really means that some data
 *		may have been left unvacuumed, so we can't update XID stats.)
 *
 *		Doing one heap at a time incurs extra overhead, since we need to
 *		check that the heap exists again just before we vacuum it.	The
 *		reason that we do this is so that vacuuming can be spread across
 *		many small transactions.  Otherwise, two-phase locking would require
 *		us to lock the entire database during one pass of the vacuum cleaner.
 *
 *		At entry and exit, we are not inside a transaction.
 */
static bool
vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind)
{
	LOCKMODE	lmode;
	Relation	onerel;
	LockRelId	onerelid;
	Oid			toast_relid;
	bool		result;

	/* Begin a transaction for vacuuming this relation */
	StartTransactionCommand(true);
	SetQuerySnapshot();			/* might be needed for functional index */

	/*
	 * Check for user-requested abort.	Note we want this to be inside a
	 * transaction, so xact.c doesn't issue useless WARNING.
	 */
	CHECK_FOR_INTERRUPTS();

	/*
	 * Race condition -- if the pg_class tuple has gone away since the
	 * last time we saw it, we don't need to vacuum it.
	 */
	if (!SearchSysCacheExists(RELOID,
							  ObjectIdGetDatum(relid),
							  0, 0, 0))
	{
		CommitTransactionCommand(true);
		return true;			/* okay 'cause no data there */
	}

	/*
	 * Determine the type of lock we want --- hard exclusive lock for a
	 * FULL vacuum, but just ShareUpdateExclusiveLock for concurrent
	 * vacuum.	Either way, we can be sure that no other backend is
	 * vacuuming the same table.
	 */
	lmode = vacstmt->full ? AccessExclusiveLock : ShareUpdateExclusiveLock;
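
	/*
	 * (Sketch of the distinction: AccessExclusiveLock blocks all other
	 * users of the table, which VACUUM FULL needs because it moves tuples
	 * around; ShareUpdateExclusiveLock still admits ordinary reads and
	 * writes but conflicts with itself, so no two vacuums can run on the
	 * same table concurrently.)
	 */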

	/*
	 * Open the class, get an appropriate lock on it, and check
	 * permissions.
	 *
	 * We allow the user to vacuum a table if he is superuser, the table
	 * owner, or the database owner (but in the latter case, only if it's
	 * not a shared relation).	pg_class_ownercheck includes the superuser
	 * case.
	 *
	 * Note we choose to treat permissions failure as a WARNING and keep
	 * trying to vacuum the rest of the DB --- is this appropriate?
	 */
	onerel = relation_open(relid, lmode);

	if (!(pg_class_ownercheck(RelationGetRelid(onerel), GetUserId()) ||
		  (is_dbadmin(MyDatabaseId) && !onerel->rd_rel->relisshared)))
	{
		elog(WARNING, "Skipping \"%s\" --- only table or database owner can VACUUM it",
			 RelationGetRelationName(onerel));
		relation_close(onerel, lmode);
		CommitTransactionCommand(true);
		return false;
	}

	/*
	 * Check that it's a plain table; we used to do this in getrels() but
	 * seems safer to check after we've locked the relation.
	 */
	if (onerel->rd_rel->relkind != expected_relkind)
	{
		elog(WARNING, "Skipping \"%s\" --- can not process indexes, views or special system tables",
			 RelationGetRelationName(onerel));
		relation_close(onerel, lmode);
		CommitTransactionCommand(true);
		return false;
	}

	/*
	 * Silently ignore tables that are temp tables of other backends ---
	 * trying to vacuum these will lead to great unhappiness, since their
	 * contents are probably not up-to-date on disk.  (We don't throw a
	 * warning here; it would just lead to chatter during a database-wide
	 * VACUUM.)
	 */
	if (isOtherTempNamespace(RelationGetNamespace(onerel)))
	{
		relation_close(onerel, lmode);
		CommitTransactionCommand(true);
		return true;			/* assume no long-lived data in temp tables */
	}

	/*
	 * Get a session-level lock too. This will protect our access to the
	 * relation across multiple transactions, so that we can vacuum the
	 * relation's TOAST table (if any) secure in the knowledge that no one
	 * is deleting the parent relation.
	 *
	 * NOTE: this cannot block, even if someone else is waiting for access,
	 * because the lock manager knows that both lock requests are from the
	 * same process.
	 */
	onerelid = onerel->rd_lockInfo.lockRelId;
	LockRelationForSession(&onerelid, lmode);

	/*
	 * Remember the relation's TOAST relation for later
	 */
	toast_relid = onerel->rd_rel->reltoastrelid;

	/*
	 * Do the actual work --- either FULL or "lazy" vacuum
	 */
	if (vacstmt->full)
		full_vacuum_rel(onerel, vacstmt);
	else
		lazy_vacuum_rel(onerel, vacstmt);

	result = true;				/* did the vacuum */

	/* all done with this class, but hold lock until commit */
	relation_close(onerel, NoLock);

	/*
	 * Complete the transaction and free all temporary memory used.
	 */
	CommitTransactionCommand(true);

	/*
	 * If the relation has a secondary toast rel, vacuum that too while we
	 * still hold the session lock on the master table.  Note however that
	 * "analyze" will not get done on the toast table.	This is good,
	 * because the toaster always uses hardcoded index access and
	 * statistics are totally unimportant for toast relations.
	 */
	if (toast_relid != InvalidOid)
	{
		if (! vacuum_rel(toast_relid, vacstmt, RELKIND_TOASTVALUE))
			result = false;		/* failed to vacuum the TOAST table? */
	}

	/*
	 * Now release the session-level lock on the master table.
	 */
	UnlockRelationForSession(&onerelid, lmode);

	return result;
}


/****************************************************************************
 *																			*
 *			Code for VACUUM FULL (only)										*
 *																			*
 ****************************************************************************
 */


/*
 *	full_vacuum_rel() -- perform FULL VACUUM for one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indexes, and
 *		updates its num_pages and num_tuples statistics.
 *
 *		At entry, we have already established a transaction and opened
 *		and locked the relation.
 */
static void
full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
{
	VacPageListData vacuum_pages;		/* List of pages to vacuum and/or
										 * clean indexes */
	VacPageListData fraged_pages;		/* List of pages with space enough
										 * for re-using */
	Relation   *Irel;
	int			nindexes,
				i;
	VRelStats  *vacrelstats;
	bool		reindex = false;

	if (IsIgnoringSystemIndexes() &&
		IsSystemRelation(onerel))
		reindex = true;

	vacuum_set_xid_limits(vacstmt, onerel->rd_rel->relisshared,
						  &OldestXmin, &FreezeLimit);

	/*
	 * Set up statistics-gathering machinery.
	 */
	vacrelstats = (VRelStats *) palloc(sizeof(VRelStats));
	vacrelstats->rel_pages = 0;
	vacrelstats->rel_tuples = 0;
	vacrelstats->hasindex = false;

	/* scan the heap */
	vacuum_pages.num_pages = fraged_pages.num_pages = 0;
	scan_heap(vacrelstats, onerel, &vacuum_pages, &fraged_pages);

	/* Now open all indexes of the relation */
	vac_open_indexes(onerel, &nindexes, &Irel);
	if (!Irel)
		reindex = false;
	else if (!RelationGetForm(onerel)->relhasindex)
		reindex = true;
	if (nindexes > 0)
		vacrelstats->hasindex = true;

#ifdef NOT_USED

	/*
	 * reindex in VACUUM is dangerous under WAL. ifdef out until it
	 * becomes safe.
	 */
	if (reindex)
	{
		vac_close_indexes(nindexes, Irel);
		Irel = (Relation *) NULL;
		activate_indexes_of_a_table(onerel, false);
	}
#endif   /* NOT_USED */

	/* Clean/scan index relation(s) */
	if (Irel != (Relation *) NULL)
	{
		if (vacuum_pages.num_pages > 0)
		{
			for (i = 0; i < nindexes; i++)
				vacuum_index(&vacuum_pages, Irel[i],
							 vacrelstats->rel_tuples, 0);
		}
		else
		{
			/* just scan indexes to update statistic */
			for (i = 0; i < nindexes; i++)
				scan_index(Irel[i], vacrelstats->rel_tuples);
		}
	}

	if (fraged_pages.num_pages > 0)
	{
		/* Try to shrink heap */
		repair_frag(vacrelstats, onerel, &vacuum_pages, &fraged_pages,
					nindexes, Irel);
		vac_close_indexes(nindexes, Irel);
	}
	else
	{
		vac_close_indexes(nindexes, Irel);
		if (vacuum_pages.num_pages > 0)
		{
			/* Clean pages from vacuum_pages list */
			vacuum_heap(vacrelstats, onerel, &vacuum_pages);
		}
		else
		{
			/*
			 * Flush dirty pages out to disk.  We must do this even if we
			 * didn't do anything else, because we want to ensure that all
			 * tuples have correct on-row commit status on disk (see
			 * bufmgr.c's comments for FlushRelationBuffers()).
			 */
			i = FlushRelationBuffers(onerel, vacrelstats->rel_pages);
			if (i < 0)
				elog(ERROR, "VACUUM (full_vacuum_rel): FlushRelationBuffers returned %d",
					 i);
		}
	}

#ifdef NOT_USED
	if (reindex)
		activate_indexes_of_a_table(onerel, true);
#endif   /* NOT_USED */

	/* update shared free space map with final free space info */
	vac_update_fsm(onerel, &fraged_pages, vacrelstats->rel_pages);

	/* update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(onerel), vacrelstats->rel_pages,
						vacrelstats->rel_tuples, vacrelstats->hasindex);
}
1003
/*
B
Bruce Momjian 已提交
1004
 *	scan_heap() -- scan an open heap relation
1005
 *
1006 1007 1008 1009
 *		This routine sets commit status bits, constructs vacuum_pages (list
 *		of pages we need to compact free space on and/or clean indexes of
 *		deleted tuples), constructs fraged_pages (list of pages with free
 *		space that tuples could be moved into), and calculates statistics
1010
 *		on the number of live tuples in the heap.
1011 1012
 */
static void
B
Bruce Momjian 已提交
1013
scan_heap(VRelStats *vacrelstats, Relation onerel,
B
Bruce Momjian 已提交
1014
		  VacPageList vacuum_pages, VacPageList fraged_pages)
1015
{
1016
	BlockNumber nblocks,
1017 1018 1019
				blkno;
	ItemId		itemid;
	Buffer		buf;
B
Bruce Momjian 已提交
1020
	HeapTupleData tuple;
1021 1022 1023 1024 1025 1026
	OffsetNumber offnum,
				maxoff;
	bool		pgchanged,
				tupgone,
				notup;
	char	   *relname;
B
Bruce Momjian 已提交
1027
	VacPage		vacpage,
1028
				vacpagecopy;
1029
	BlockNumber empty_pages,
B
Bruce Momjian 已提交
1030 1031
				new_pages,
				changed_pages,
B
Bruce Momjian 已提交
1032
				empty_end_pages;
1033 1034 1035
	double		num_tuples,
				tups_vacuumed,
				nkeep,
1036
				nunused;
1037
	double		free_size,
B
Bruce Momjian 已提交
1038
				usable_free_size;
1039
	Size		min_tlen = MaxTupleSize;
1040
	Size		max_tlen = 0;
1041
	int			i;
1042
	bool		do_shrinking = true;
1043 1044 1045
	VTupleLink	vtlinks = (VTupleLink) palloc(100 * sizeof(VTupleLinkData));
	int			num_vtlinks = 0;
	int			free_vtlinks = 100;
1046
	VacRUsage	ru0;
1047

1048
	vac_init_rusage(&ru0);
1049

1050
	relname = RelationGetRelationName(onerel);
1051 1052 1053
	elog(elevel, "--Relation %s.%s--",
		 get_namespace_name(RelationGetNamespace(onerel)),
		 relname);
1054

1055
	empty_pages = new_pages = changed_pages = empty_end_pages = 0;
1056
	num_tuples = tups_vacuumed = nkeep = nunused = 0;
1057
	free_size = 0;
1058 1059 1060

	nblocks = RelationGetNumberOfBlocks(onerel);

1061 1062 1063 1064
	/*
	 * We initially create each VacPage item in a maximal-sized workspace,
	 * then copy the workspace into a just-large-enough copy.
	 */
B
Bruce Momjian 已提交
1065
	vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
1066 1067 1068

	for (blkno = 0; blkno < nblocks; blkno++)
	{
1069 1070 1071 1072 1073
		Page		page,
					tempPage = NULL;
		bool		do_reap,
					do_frag;

1074 1075
		CHECK_FOR_INTERRUPTS();

1076 1077
		buf = ReadBuffer(onerel, blkno);
		page = BufferGetPage(buf);
1078

		vacpage->blkno = blkno;
		vacpage->offsets_used = 0;
		vacpage->offsets_free = 0;

		if (PageIsNew(page))
		{
			elog(WARNING, "Rel %s: Uninitialized page %u - fixing",
				 relname, blkno);
			PageInit(page, BufferGetPageSize(buf), 0);
			vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
			free_size += vacpage->free;
			new_pages++;
			empty_end_pages++;
			vacpagecopy = copy_vac_page(vacpage);
			vpage_insert(vacuum_pages, vacpagecopy);
			vpage_insert(fraged_pages, vacpagecopy);
			WriteBuffer(buf);
			continue;
		}

		if (PageIsEmpty(page))
		{
			vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
			free_size += vacpage->free;
			empty_pages++;
			empty_end_pages++;
			vacpagecopy = copy_vac_page(vacpage);
			vpage_insert(vacuum_pages, vacpagecopy);
			vpage_insert(fraged_pages, vacpagecopy);
			ReleaseBuffer(buf);
			continue;
		}

		pgchanged = false;
		notup = true;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			uint16		sv_infomask;

			itemid = PageGetItemId(page, offnum);

			/*
			 * Collect un-used items too - it's possible to have indexes
			 * pointing here after crash.
			 */
			if (!ItemIdIsUsed(itemid))
			{
				vacpage->offsets[vacpage->offsets_free++] = offnum;
				nunused += 1;
				continue;
			}

			tuple.t_datamcxt = NULL;
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);

			tupgone = false;
			sv_infomask = tuple.t_data->t_infomask;

			switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin))
			{
				case HEAPTUPLE_DEAD:
					tupgone = true;		/* we can delete the tuple */
					break;
				case HEAPTUPLE_LIVE:

					/*
					 * Tuple is good.  Consider whether to replace its
					 * xmin value with FrozenTransactionId.
					 */
					if (TransactionIdIsNormal(HeapTupleHeaderGetXmin(tuple.t_data)) &&
						TransactionIdPrecedes(HeapTupleHeaderGetXmin(tuple.t_data),
											  FreezeLimit))
					{
						HeapTupleHeaderSetXmin(tuple.t_data, FrozenTransactionId);
						/* infomask should be okay already */
						Assert(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED);
						pgchanged = true;
					}
					break;
				case HEAPTUPLE_RECENTLY_DEAD:

					/*
					 * If the tuple was recently deleted then we must not
					 * remove it from the relation.
					 */
					nkeep += 1;

					/*
					 * If we do shrinking and this tuple is an updated one,
					 * then remember it to construct updated-tuple
					 * dependencies.
					 */
					if (do_shrinking &&
						!(ItemPointerEquals(&(tuple.t_self),
											&(tuple.t_data->t_ctid))))
					{
						if (free_vtlinks == 0)
						{
							free_vtlinks = 1000;
							vtlinks = (VTupleLink) repalloc(vtlinks,
										   (free_vtlinks + num_vtlinks) *
												 sizeof(VTupleLinkData));
						}
						vtlinks[num_vtlinks].new_tid = tuple.t_data->t_ctid;
						vtlinks[num_vtlinks].this_tid = tuple.t_self;
						free_vtlinks--;
						num_vtlinks++;
					}
					break;
				case HEAPTUPLE_INSERT_IN_PROGRESS:

					/*
					 * This should not happen, since we hold exclusive
					 * lock on the relation; shouldn't we raise an error?
					 */
					elog(WARNING, "Rel %s: TID %u/%u: InsertTransactionInProgress %u - can't shrink relation",
						 relname, blkno, offnum, HeapTupleHeaderGetXmin(tuple.t_data));
					do_shrinking = false;
					break;
				case HEAPTUPLE_DELETE_IN_PROGRESS:

					/*
					 * This should not happen, since we hold exclusive
					 * lock on the relation; shouldn't we raise an error?
					 */
					elog(WARNING, "Rel %s: TID %u/%u: DeleteTransactionInProgress %u - can't shrink relation",
						 relname, blkno, offnum, HeapTupleHeaderGetXmax(tuple.t_data));
					do_shrinking = false;
					break;
				default:
					elog(ERROR, "Unexpected HeapTupleSatisfiesVacuum result");
					break;
			}

			/* check for hint-bit update by HeapTupleSatisfiesVacuum */
			if (sv_infomask != tuple.t_data->t_infomask)
				pgchanged = true;

			/*
			 * Other checks...
			 */
			if (onerel->rd_rel->relhasoids &&
				!OidIsValid(HeapTupleGetOid(&tuple)))
				elog(WARNING, "Rel %s: TID %u/%u: OID IS INVALID. TUPGONE %d.",
					 relname, blkno, offnum, (int) tupgone);

			if (tupgone)
			{
				ItemId		lpp;

				/*
				 * Here we are building a temporary copy of the page with
				 * dead tuples removed.  Below we will apply
				 * PageRepairFragmentation to the copy, so that we can
				 * determine how much space will be available after
				 * removal of dead tuples.	But note we are NOT changing
				 * the real page yet...
				 */
				if (tempPage == (Page) NULL)
				{
					Size		pageSize;

					pageSize = PageGetPageSize(page);
					tempPage = (Page) palloc(pageSize);
					memcpy(tempPage, page, pageSize);
				}

				/* mark it unused on the temp page */
				lpp = PageGetItemId(tempPage, offnum);
				lpp->lp_flags &= ~LP_USED;

				vacpage->offsets[vacpage->offsets_free++] = offnum;
				tups_vacuumed += 1;
			}
			else
			{
				num_tuples += 1;
				notup = false;
				if (tuple.t_len < min_tlen)
					min_tlen = tuple.t_len;
				if (tuple.t_len > max_tlen)
					max_tlen = tuple.t_len;
			}
		}						/* scan along page */

B
Bruce Momjian 已提交
1269
		if (tempPage != (Page) NULL)
1270 1271
		{
			/* Some tuples are removable; figure free space after removal */
1272
			PageRepairFragmentation(tempPage, NULL);
B
Bruce Momjian 已提交
1273
			vacpage->free = ((PageHeader) tempPage)->pd_upper - ((PageHeader) tempPage)->pd_lower;
1274
			pfree(tempPage);
1275
			do_reap = true;
1276
		}
1277 1278 1279
		else
		{
			/* Just use current available space */
B
Bruce Momjian 已提交
1280
			vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
1281 1282
			/* Need to reap the page if it has ~LP_USED line pointers */
			do_reap = (vacpage->offsets_free > 0);
V
Vadim B. Mikheev 已提交
1283
		}
1284

1285
		free_size += vacpage->free;
1286

1287 1288
		/*
		 * Add the page to fraged_pages if it has a useful amount of free
1289 1290 1291
		 * space.  "Useful" means enough for a minimal-sized tuple. But we
		 * don't know that accurately near the start of the relation, so
		 * add pages unconditionally if they have >= BLCKSZ/10 free space.
1292
		 */
1293
		do_frag = (vacpage->free >= min_tlen || vacpage->free >= BLCKSZ / 10);
1294 1295

		if (do_reap || do_frag)
1296
		{
1297 1298 1299 1300 1301
			vacpagecopy = copy_vac_page(vacpage);
			if (do_reap)
				vpage_insert(vacuum_pages, vacpagecopy);
			if (do_frag)
				vpage_insert(fraged_pages, vacpagecopy);
1302 1303
		}

1304
		if (notup)
B
Bruce Momjian 已提交
1305
			empty_end_pages++;
1306
		else
B
Bruce Momjian 已提交
1307
			empty_end_pages = 0;
1308 1309 1310 1311 1312 1313 1314 1315

		if (pgchanged)
		{
			WriteBuffer(buf);
			changed_pages++;
		}
		else
			ReleaseBuffer(buf);
1316 1317
	}

B
	pfree(vacpage);

	/* save stats in the rel list for use later */
	vacrelstats->rel_tuples = num_tuples;
	vacrelstats->rel_pages = nblocks;
	if (num_tuples == 0)
		min_tlen = max_tlen = 0;
	vacrelstats->min_tlen = min_tlen;
	vacrelstats->max_tlen = max_tlen;

	vacuum_pages->empty_end_pages = empty_end_pages;
	fraged_pages->empty_end_pages = empty_end_pages;

	/*
	 * Clear the fraged_pages list if we found we couldn't shrink. Else,
	 * remove any "empty" end-pages from the list, and compute usable free
	 * space = free space in remaining pages.
	 */
	if (do_shrinking)
	{
		Assert((BlockNumber) fraged_pages->num_pages >= empty_end_pages);
		fraged_pages->num_pages -= empty_end_pages;
		usable_free_size = 0;
		for (i = 0; i < fraged_pages->num_pages; i++)
			usable_free_size += fraged_pages->pagedesc[i]->free;
	}
	else
	{
		fraged_pages->num_pages = 0;
		usable_free_size = 0;
	}

	/* don't bother to save vtlinks if we will not call repair_frag */
	if (fraged_pages->num_pages > 0 && num_vtlinks > 0)
	{
		qsort((char *) vtlinks, num_vtlinks, sizeof(VTupleLinkData),
			  vac_cmp_vtlinks);
		vacrelstats->vtlinks = vtlinks;
		vacrelstats->num_vtlinks = num_vtlinks;
	}
	else
	{
		vacrelstats->vtlinks = NULL;
		vacrelstats->num_vtlinks = 0;
		pfree(vtlinks);
	}

	elog(elevel, "Pages %u: Changed %u, reaped %u, Empty %u, New %u; "
		 "Tup %.0f: Vac %.0f, Keep/VTL %.0f/%u, UnUsed %.0f, MinLen %lu, "
		 "MaxLen %lu; Re-using: Free/Avail. Space %.0f/%.0f; "
		 "EndEmpty/Avail. Pages %u/%u.\n\t%s",
		 nblocks, changed_pages, vacuum_pages->num_pages, empty_pages,
		 new_pages, num_tuples, tups_vacuumed,
		 nkeep, vacrelstats->num_vtlinks,
		 nunused, (unsigned long) min_tlen, (unsigned long) max_tlen,
		 free_size, usable_free_size,
		 empty_end_pages, fraged_pages->num_pages,
		 vac_show_rusage(&ru0));
}


/*
 *	repair_frag() -- try to repair relation's fragmentation
 *
 *		This routine marks dead tuples as unused and tries to re-use dead
 *		space by moving tuples (and inserting indexes if needed). It
 *		constructs a list (Nvacpagelist) of freed pages (whose tuples were
 *		moved) and cleans the indexes for them after committing the current
 *		transaction (in hack-manner - without losing locks and freeing
 *		memory!). It truncates the relation if some end-blocks have become
 *		empty.
 */
static void
repair_frag(VRelStats *vacrelstats, Relation onerel,
			VacPageList vacuum_pages, VacPageList fraged_pages,
			int nindexes, Relation *Irel)
{
	TransactionId myXID;
	CommandId	myCID;
	Buffer		buf,
				cur_buffer;
	BlockNumber nblocks,
				blkno;
	BlockNumber last_move_dest_block = 0,
				last_vacuum_block;
	Page		page,
				ToPage = NULL;
	OffsetNumber offnum,
				maxoff,
				newoff,
				max_offset;
	ItemId		itemid,
				newitemid;
	HeapTupleData tuple,
				newtup;
	TupleDesc	tupdesc;
	ResultRelInfo *resultRelInfo;
	EState	   *estate;
	TupleTable	tupleTable;
	TupleTableSlot *slot;
	VacPageListData Nvacpagelist;
	VacPage		cur_page = NULL,
				last_vacuum_page,
				vacpage,
			   *curpage;
	int			cur_item = 0;
	int			i;
	Size		tuple_len;
	int			num_moved,
				num_fraged_pages,
				vacuumed_pages;
	int			checked_moved,
				num_tuples,
				keep_tuples = 0;
	bool		isempty,
				dowrite,
				chain_tuple_moved;
	VacRUsage	ru0;

	vac_init_rusage(&ru0);

	myXID = GetCurrentTransactionId();
	myCID = GetCurrentCommandId();

	tupdesc = RelationGetDescr(onerel);

	/*
	 * We need a ResultRelInfo and an EState so we can use the regular
	 * executor's index-entry-making machinery.
	 */
	estate = CreateExecutorState();

	resultRelInfo = makeNode(ResultRelInfo);
	resultRelInfo->ri_RangeTableIndex = 1;		/* dummy */
	resultRelInfo->ri_RelationDesc = onerel;
	resultRelInfo->ri_TrigDesc = NULL;	/* we don't fire triggers */

	ExecOpenIndices(resultRelInfo);

	estate->es_result_relations = resultRelInfo;
	estate->es_num_result_relations = 1;
	estate->es_result_relation_info = resultRelInfo;

	/* Set up a dummy tuple table too */
	tupleTable = ExecCreateTupleTable(1);
	slot = ExecAllocTableSlot(tupleTable);
	ExecSetSlotDescriptor(slot, tupdesc, false);

	Nvacpagelist.num_pages = 0;
	num_fraged_pages = fraged_pages->num_pages;
	Assert((BlockNumber) vacuum_pages->num_pages >= vacuum_pages->empty_end_pages);
	vacuumed_pages = vacuum_pages->num_pages - vacuum_pages->empty_end_pages;
	if (vacuumed_pages > 0)
	{
		/* get last reaped page from vacuum_pages */
		last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
		last_vacuum_block = last_vacuum_page->blkno;
	}
	else
	{
		last_vacuum_page = NULL;
		last_vacuum_block = InvalidBlockNumber;
	}
	cur_buffer = InvalidBuffer;
	num_moved = 0;

	vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
	vacpage->offsets_used = vacpage->offsets_free = 0;

	/*
	 * Scan pages backwards from the last nonempty page, trying to move
	 * tuples down to lower pages.	Quit when we reach a page that we have
	 * moved any tuples onto, or the first page if we haven't moved
	 * anything, or when we find a page we cannot completely empty (this
	 * last condition is handled by "break" statements within the loop).
	 *
	 * NB: this code depends on the vacuum_pages and fraged_pages lists being
	 * in order by blkno.
	 */
	nblocks = vacrelstats->rel_pages;
	for (blkno = nblocks - vacuum_pages->empty_end_pages - 1;
		 blkno > last_move_dest_block;
		 blkno--)
	{
		CHECK_FOR_INTERRUPTS();

		/*
		 * Forget fraged_pages pages at or after this one; they're no
		 * longer useful as move targets, since we only want to move down.
		 * Note that since we stop the outer loop at last_move_dest_block,
		 * pages removed here cannot have had anything moved onto them
		 * already.
		 *
		 * Also note that we don't change the stored fraged_pages list, only
		 * our local variable num_fraged_pages; so the forgotten pages are
		 * still available to be loaded into the free space map later.
		 */
		while (num_fraged_pages > 0 &&
			fraged_pages->pagedesc[num_fraged_pages - 1]->blkno >= blkno)
		{
			Assert(fraged_pages->pagedesc[num_fraged_pages - 1]->offsets_used == 0);
			--num_fraged_pages;
		}

		/*
		 * Process this page of relation.
		 */
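
		/*
		 * (Invariant sketch: blkno walks backward over source pages while
		 * the surviving fraged_pages entries --- all strictly before
		 * blkno --- supply the move targets; the two ends meet at
		 * last_move_dest_block.)
		 */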
		buf = ReadBuffer(onerel, blkno);
		page = BufferGetPage(buf);

		vacpage->offsets_free = 0;

		isempty = PageIsEmpty(page);

		dowrite = false;

		/* Is the page in the vacuum_pages list? */
		if (blkno == last_vacuum_block)
		{
			if (last_vacuum_page->offsets_free > 0)
			{
				/* there are dead tuples on this page - clean them */
				Assert(!isempty);
				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
				vacuum_page(onerel, buf, last_vacuum_page);
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
				dowrite = true;
			}
			else
				Assert(isempty);
			--vacuumed_pages;
			if (vacuumed_pages > 0)
			{
				/* get prev reaped page from vacuum_pages */
				last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
				last_vacuum_block = last_vacuum_page->blkno;
			}
			else
			{
				last_vacuum_page = NULL;
				last_vacuum_block = InvalidBlockNumber;
			}
			if (isempty)
			{
				ReleaseBuffer(buf);
				continue;
			}
		}
		else
			Assert(!isempty);

		chain_tuple_moved = false;		/* no chain tuple was moved
										 * off this page, yet */
		vacpage->blkno = blkno;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			itemid = PageGetItemId(page, offnum);

			if (!ItemIdIsUsed(itemid))
				continue;

			tuple.t_datamcxt = NULL;
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple_len = tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);

			if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
			{
				if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
					elog(ERROR, "HEAP_MOVED_IN was not expected");

				/*
				 * If this (chain) tuple is moved by me already then I
				 * have to check whether it is in vacpage or not - i.e.
				 * whether it was moved while cleaning this page or some
				 * previous one.
				 */
				if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
				{
					if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
						elog(ERROR, "Invalid XVAC in tuple header");
					if (keep_tuples == 0)
						continue;
					if (chain_tuple_moved)		/* some chains were moved
												 * while */
					{			/* cleaning this page */
						Assert(vacpage->offsets_free > 0);
						for (i = 0; i < vacpage->offsets_free; i++)
						{
							if (vacpage->offsets[i] == offnum)
								break;
						}
						if (i >= vacpage->offsets_free) /* not found */
						{
							vacpage->offsets[vacpage->offsets_free++] = offnum;
							keep_tuples--;
						}
					}
					else
					{
						vacpage->offsets[vacpage->offsets_free++] = offnum;
						keep_tuples--;
					}
					continue;
				}
				elog(ERROR, "HEAP_MOVED_OFF was expected");
			}

			/*
			 * If this tuple is in a chain of tuples created by updates
			 * from "recent" transactions, then we have to move the whole
			 * chain of tuples to other places.
			 *
			 * NOTE: this test is not 100% accurate: it is possible for a
			 * tuple to be an updated one with recent xmin, and yet not
			 * have a corresponding tuple in the vtlinks list.	Presumably
			 * there was once a parent tuple with xmax matching the xmin,
			 * but it's possible that that tuple has been removed --- for
			 * example, if it had xmin = xmax then
			 * HeapTupleSatisfiesVacuum would deem it removable as soon as
			 * the xmin xact completes.
			 *
			 * To be on the safe side, we abandon the repair_frag process if
			 * we cannot find the parent tuple in vtlinks.	This may be
			 * overly conservative; AFAICS it would be safe to move the
			 * chain.
			 */
			if (((tuple.t_data->t_infomask & HEAP_UPDATED) &&
				 !TransactionIdPrecedes(HeapTupleHeaderGetXmin(tuple.t_data),
										OldestXmin)) ||
				(!(tuple.t_data->t_infomask & (HEAP_XMAX_INVALID |
											   HEAP_MARKED_FOR_UPDATE)) &&
				 !(ItemPointerEquals(&(tuple.t_self),
									 &(tuple.t_data->t_ctid)))))
			{
				Buffer		Cbuf = buf;
				bool		freeCbuf = false;
				bool		chain_move_failed = false;
				Page		Cpage;
				ItemId		Citemid;
				ItemPointerData Ctid;
				HeapTupleData tp = tuple;
				Size		tlen = tuple_len;
				VTupleMove	vtmove;
				int			num_vtmove;
				int			free_vtmove;
				VacPage		to_vacpage = NULL;
				int			to_item = 0;
				int			ti;

				if (cur_buffer != InvalidBuffer)
				{
					WriteBuffer(cur_buffer);
					cur_buffer = InvalidBuffer;
				}

				/* Quick exit if we have no vtlinks to search in */
				if (vacrelstats->vtlinks == NULL)
				{
					elog(DEBUG1, "Parent item in update-chain not found - can't continue repair_frag");
					break;		/* out of walk-along-page loop */
				}

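				/*
				 * vtmove[] will collect one entry per chain member,
				 * ordered from the chain end back toward its start,
				 * recording where each tuple is to be moved.
				 */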
				vtmove = (VTupleMove) palloc(100 * sizeof(VTupleMoveData));
				num_vtmove = 0;
				free_vtmove = 100;

				/*
				 * If this tuple is in the beginning or middle of the
				 * chain, then we have to move to the end of the chain.
				 */
				while (!(tp.t_data->t_infomask & (HEAP_XMAX_INVALID |
												  HEAP_MARKED_FOR_UPDATE)) &&
					   !(ItemPointerEquals(&(tp.t_self),
										   &(tp.t_data->t_ctid))))
				{
					Ctid = tp.t_data->t_ctid;
					if (freeCbuf)
						ReleaseBuffer(Cbuf);
					freeCbuf = true;
					Cbuf = ReadBuffer(onerel,
									  ItemPointerGetBlockNumber(&Ctid));
					Cpage = BufferGetPage(Cbuf);
					Citemid = PageGetItemId(Cpage,
									  ItemPointerGetOffsetNumber(&Ctid));
					if (!ItemIdIsUsed(Citemid))
					{
						/*
						 * This means that in the middle of the chain
						 * there was a tuple updated by an older (than
						 * OldestXmin) xaction, and this tuple is already
						 * deleted by me.  Actually, the upper part of the
						 * chain should be removed, and it seems that this
						 * should be handled in scan_heap(), but it's not
						 * implemented at the moment, so we just stop
						 * shrinking here.
						 */
						elog(DEBUG1, "Child itemid in update-chain marked as unused - can't continue repair_frag");
						chain_move_failed = true;
						break;	/* out of loop to move to chain end */
					}
					tp.t_datamcxt = NULL;
					tp.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
					tp.t_self = Ctid;
					tlen = tp.t_len = ItemIdGetLength(Citemid);
				}
				if (chain_move_failed)
				{
					if (freeCbuf)
						ReleaseBuffer(Cbuf);
					pfree(vtmove);
					break;		/* out of walk-along-page loop */
				}

				/*
				 * Check if all items in chain can be moved.  We walk the
				 * chain from the end located above back toward its start,
				 * following vtlinks, and reserve space for each member on
				 * some page from fraged_pages.
				 */
				for (;;)
				{
					Buffer		Pbuf;
					Page		Ppage;
					ItemId		Pitemid;
					HeapTupleData Ptp;
					VTupleLinkData vtld,
							   *vtlp;

					if (to_vacpage == NULL ||
						!enough_space(to_vacpage, tlen))
					{
						for (i = 0; i < num_fraged_pages; i++)
						{
							if (enough_space(fraged_pages->pagedesc[i], tlen))
								break;
						}

						if (i == num_fraged_pages)
						{
							/* can't move item anywhere */
							chain_move_failed = true;
							break;		/* out of check-all-items loop */
						}
						to_item = i;
						to_vacpage = fraged_pages->pagedesc[to_item];
					}
					to_vacpage->free -= MAXALIGN(tlen);
					if (to_vacpage->offsets_used >= to_vacpage->offsets_free)
						to_vacpage->free -= sizeof(ItemIdData);
					(to_vacpage->offsets_used)++;
					if (free_vtmove == 0)
					{
						free_vtmove = 1000;
						vtmove = (VTupleMove)
							repalloc(vtmove,
									 (free_vtmove + num_vtmove) *
									 sizeof(VTupleMoveData));
					}
					vtmove[num_vtmove].tid = tp.t_self;
					vtmove[num_vtmove].vacpage = to_vacpage;
					if (to_vacpage->offsets_used == 1)
						vtmove[num_vtmove].cleanVpd = true;
					else
						vtmove[num_vtmove].cleanVpd = false;
					free_vtmove--;
					num_vtmove++;

					/* At beginning of chain? */
					if (!(tp.t_data->t_infomask & HEAP_UPDATED) ||
						TransactionIdPrecedes(HeapTupleHeaderGetXmin(tp.t_data),
											  OldestXmin))
						break;

					/* No, move to tuple with prior row version */
					vtld.new_tid = tp.t_self;
					vtlp = (VTupleLink)
						vac_bsearch((void *) &vtld,
									(void *) (vacrelstats->vtlinks),
									vacrelstats->num_vtlinks,
									sizeof(VTupleLinkData),
									vac_cmp_vtlinks);
					if (vtlp == NULL)
					{
						/* see discussion above */
						elog(DEBUG1, "Parent item in update-chain not found - can't continue repair_frag");
						chain_move_failed = true;
						break;	/* out of check-all-items loop */
					}
					tp.t_self = vtlp->this_tid;
					Pbuf = ReadBuffer(onerel,
								ItemPointerGetBlockNumber(&(tp.t_self)));
					Ppage = BufferGetPage(Pbuf);
					Pitemid = PageGetItemId(Ppage,
							   ItemPointerGetOffsetNumber(&(tp.t_self)));
					/* this can't happen since we saw tuple earlier: */
					if (!ItemIdIsUsed(Pitemid))
						elog(ERROR, "Parent itemid marked as unused");
					Ptp.t_datamcxt = NULL;
					Ptp.t_data = (HeapTupleHeader) PageGetItem(Ppage, Pitemid);

					/* ctid should not have changed since we saved it */
					Assert(ItemPointerEquals(&(vtld.new_tid),
											 &(Ptp.t_data->t_ctid)));

					/*
					 * Read above about cases when !ItemIdIsUsed(Citemid)
					 * (child item is removed)... Because at the moment we
					 * don't remove the unuseful part of an update-chain,
					 * it's possible to get a too-old parent row here.  As
					 * in the case that caused this problem, we just stop
					 * shrinking here.  I could try to find the real
					 * parent row but don't want to, because a real
					 * solution will be implemented anyway, later, and we
					 * are too close to the 6.5 release. - vadim 06/11/99
					 */
					if (!(TransactionIdEquals(HeapTupleHeaderGetXmax(Ptp.t_data),
									 HeapTupleHeaderGetXmin(tp.t_data))))
					{
						ReleaseBuffer(Pbuf);
						elog(DEBUG1, "Too old parent tuple found - can't continue repair_frag");
						chain_move_failed = true;
						break;	/* out of check-all-items loop */
					}
					tp.t_datamcxt = Ptp.t_datamcxt;
					tp.t_data = Ptp.t_data;
					tlen = tp.t_len = ItemIdGetLength(Pitemid);
					if (freeCbuf)
						ReleaseBuffer(Cbuf);
					Cbuf = Pbuf;
					freeCbuf = true;
				}				/* end of check-all-items loop */

				if (freeCbuf)
					ReleaseBuffer(Cbuf);
				freeCbuf = false;

				if (chain_move_failed)
				{
					/*
					 * Undo changes to offsets_used state.	We don't
					 * bother cleaning up the amount-free state, since
					 * we're not going to do any further tuple motion.
					 */
					for (i = 0; i < num_vtmove; i++)
					{
						Assert(vtmove[i].vacpage->offsets_used > 0);
						(vtmove[i].vacpage->offsets_used)--;
					}
					pfree(vtmove);
					break;		/* out of walk-along-page loop */
				}

				/*
				 * Okay, move the whole tuple chain
				 */
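				/*
				 * vtmove[] runs from the chain end back to its start, so
				 * the chain end is moved first; Ctid tracks the new TID
				 * of the previously moved tuple (the successor in the
				 * chain), which becomes each copy's t_ctid link.
				 */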
				ItemPointerSetInvalid(&Ctid);
				for (ti = 0; ti < num_vtmove; ti++)
				{
					VacPage		destvacpage = vtmove[ti].vacpage;

					/* Get page to move from */
					tuple.t_self = vtmove[ti].tid;
					Cbuf = ReadBuffer(onerel,
							 ItemPointerGetBlockNumber(&(tuple.t_self)));

					/* Get page to move to */
					cur_buffer = ReadBuffer(onerel, destvacpage->blkno);

					LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
					if (cur_buffer != Cbuf)
						LockBuffer(Cbuf, BUFFER_LOCK_EXCLUSIVE);

					ToPage = BufferGetPage(cur_buffer);
					Cpage = BufferGetPage(Cbuf);

					Citemid = PageGetItemId(Cpage,
							ItemPointerGetOffsetNumber(&(tuple.t_self)));
					tuple.t_datamcxt = NULL;
					tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
					tuple_len = tuple.t_len = ItemIdGetLength(Citemid);

					/*
					 * make a copy of the source tuple, and then mark the
					 * source tuple MOVED_OFF.
					 */
					heap_copytuple_with_tuple(&tuple, &newtup);

					/*
					 * register invalidation of source tuple in catcaches.
					 */
					CacheInvalidateHeapTuple(onerel, &tuple);
					/* NO ELOG(ERROR) TILL CHANGES ARE LOGGED */
					START_CRIT_SECTION();

					tuple.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
												  HEAP_XMIN_INVALID |
												  HEAP_MOVED_IN);
					tuple.t_data->t_infomask |= HEAP_MOVED_OFF;
					HeapTupleHeaderSetXvac(tuple.t_data, myXID);

					/*
					 * If this page was not used before - clean it.
					 *
					 * NOTE: a nasty bug used to lurk here.  It is possible
					 * for the source and destination pages to be the same
					 * (since this tuple-chain member can be on a page
					 * lower than the one we're currently processing in
					 * the outer loop).  If that's true, then after
					 * vacuum_page() the source tuple will have been
					 * moved, and tuple.t_data will be pointing at
					 * garbage.  Therefore we must do everything that uses
					 * tuple.t_data BEFORE this step!!
					 *
					 * This path is different from the other callers of
					 * vacuum_page, because we have already incremented
					 * the vacpage's offsets_used field to account for the
					 * tuple(s) we expect to move onto the page. Therefore
					 * vacuum_page's check for offsets_used == 0 is wrong.
					 * But since that's a good debugging check for all
					 * other callers, we work around it here rather than
					 * remove it.
					 */
					if (!PageIsEmpty(ToPage) && vtmove[ti].cleanVpd)
					{
						int			sv_offsets_used = destvacpage->offsets_used;

						destvacpage->offsets_used = 0;
						vacuum_page(onerel, cur_buffer, destvacpage);
						destvacpage->offsets_used = sv_offsets_used;
					}

					/*
					 * Update the state of the copied tuple, and store it
					 * on the destination page.
					 */
					newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
												   HEAP_XMIN_INVALID |
												   HEAP_MOVED_OFF);
					newtup.t_data->t_infomask |= HEAP_MOVED_IN;
					HeapTupleHeaderSetXvac(newtup.t_data, myXID);
					newoff = PageAddItem(ToPage,
										 (Item) newtup.t_data,
										 tuple_len,
										 InvalidOffsetNumber,
										 LP_USED);
					if (newoff == InvalidOffsetNumber)
					{
						elog(PANIC, "moving chain: failed to add item with len = %lu to page %u",
						  (unsigned long) tuple_len, destvacpage->blkno);
					}
					newitemid = PageGetItemId(ToPage, newoff);
					pfree(newtup.t_data);
					newtup.t_datamcxt = NULL;
					newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
					ItemPointerSet(&(newtup.t_self), destvacpage->blkno, newoff);

					/* XLOG stuff */
					if (!onerel->rd_istemp)
					{
						XLogRecPtr	recptr =
						log_heap_move(onerel, Cbuf, tuple.t_self,
									  cur_buffer, &newtup);

						if (Cbuf != cur_buffer)
						{
							PageSetLSN(Cpage, recptr);
							PageSetSUI(Cpage, ThisStartUpID);
						}
						PageSetLSN(ToPage, recptr);
						PageSetSUI(ToPage, ThisStartUpID);
					}
					else
					{
						/*
						 * No XLOG record, but still need to flag that XID
						 * exists on disk
						 */
						MyXactMadeTempRelUpdate = true;
					}

					END_CRIT_SECTION();

					if (destvacpage->blkno > last_move_dest_block)
						last_move_dest_block = destvacpage->blkno;

					/*
					 * Set new tuple's t_ctid pointing to itself for last
					 * tuple in chain, and to next tuple in chain
					 * otherwise.
					 */
					if (!ItemPointerIsValid(&Ctid))
						newtup.t_data->t_ctid = newtup.t_self;
					else
						newtup.t_data->t_ctid = Ctid;
					Ctid = newtup.t_self;

					num_moved++;

					/*
					 * Remember that we moved tuple from the current page
					 * (corresponding index tuple will be cleaned).
					 */
					if (Cbuf == buf)
						vacpage->offsets[vacpage->offsets_free++] =
							ItemPointerGetOffsetNumber(&(tuple.t_self));
					else
						keep_tuples++;

					LockBuffer(cur_buffer, BUFFER_LOCK_UNLOCK);
					if (cur_buffer != Cbuf)
						LockBuffer(Cbuf, BUFFER_LOCK_UNLOCK);

					/* Create index entries for the moved tuple */
					if (resultRelInfo->ri_NumIndices > 0)
					{
						ExecStoreTuple(&newtup, slot, InvalidBuffer, false);
						ExecInsertIndexTuples(slot, &(newtup.t_self),
											  estate, true);
					}

					WriteBuffer(cur_buffer);
					WriteBuffer(Cbuf);
				}				/* end of move-the-tuple-chain loop */

				cur_buffer = InvalidBuffer;
				pfree(vtmove);
				chain_tuple_moved = true;

				/* advance to next tuple in walk-along-page loop */
				continue;
			}					/* end of is-tuple-in-chain test */
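
			/*
			 * cur_buffer and cur_page track the current move-destination
			 * page; the pin is kept across tuples so that consecutive
			 * tuples can be packed onto the same target page.
			 */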

			/* try to find new page for this tuple */
			if (cur_buffer == InvalidBuffer ||
				!enough_space(cur_page, tuple_len))
			{
				if (cur_buffer != InvalidBuffer)
				{
					WriteBuffer(cur_buffer);
					cur_buffer = InvalidBuffer;
				}
				for (i = 0; i < num_fraged_pages; i++)
				{
					if (enough_space(fraged_pages->pagedesc[i], tuple_len))
						break;
				}
				if (i == num_fraged_pages)
					break;		/* can't move item anywhere */
				cur_item = i;
				cur_page = fraged_pages->pagedesc[cur_item];
				cur_buffer = ReadBuffer(onerel, cur_page->blkno);
				LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
				ToPage = BufferGetPage(cur_buffer);
				/* if this page was not used before - clean it */
				if (!PageIsEmpty(ToPage) && cur_page->offsets_used == 0)
					vacuum_page(onerel, cur_buffer, cur_page);
			}
			else
				LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);

			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

			/* copy tuple */
			heap_copytuple_with_tuple(&tuple, &newtup);

			/*
			 * register invalidation of source tuple in catcaches.
			 *
			 * (Note: we do not need to register the copied tuple, because we
			 * are not changing the tuple contents and so there cannot be
			 * any need to flush negative catcache entries.)
			 */
			CacheInvalidateHeapTuple(onerel, &tuple);

			/* NO ELOG(ERROR) TILL CHANGES ARE LOGGED */
			START_CRIT_SECTION();

			/*
			 * Mark new tuple as MOVED_IN by me.
			 */
			newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
										   HEAP_XMIN_INVALID |
										   HEAP_MOVED_OFF);
			newtup.t_data->t_infomask |= HEAP_MOVED_IN;
			HeapTupleHeaderSetXvac(newtup.t_data, myXID);

			/* add tuple to the page */
			newoff = PageAddItem(ToPage, (Item) newtup.t_data, tuple_len,
								 InvalidOffsetNumber, LP_USED);
			if (newoff == InvalidOffsetNumber)
			{
				elog(PANIC, "failed to add item with len = %lu to page %u (free space %lu, nusd %u, noff %u)",
					 (unsigned long) tuple_len,
					 cur_page->blkno, (unsigned long) cur_page->free,
					 cur_page->offsets_used, cur_page->offsets_free);
			}
			newitemid = PageGetItemId(ToPage, newoff);
			pfree(newtup.t_data);
			newtup.t_datamcxt = NULL;
			newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
			ItemPointerSet(&(newtup.t_data->t_ctid), cur_page->blkno, newoff);
			newtup.t_self = newtup.t_data->t_ctid;

			/*
			 * Mark old tuple as MOVED_OFF by me.
			 */
			tuple.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
										  HEAP_XMIN_INVALID |
										  HEAP_MOVED_IN);
			tuple.t_data->t_infomask |= HEAP_MOVED_OFF;
			HeapTupleHeaderSetXvac(tuple.t_data, myXID);

			/* XLOG stuff */
			if (!onerel->rd_istemp)
			{
				XLogRecPtr	recptr =
				log_heap_move(onerel, buf, tuple.t_self,
							  cur_buffer, &newtup);

				PageSetLSN(page, recptr);
				PageSetSUI(page, ThisStartUpID);
				PageSetLSN(ToPage, recptr);
				PageSetSUI(ToPage, ThisStartUpID);
			}
			else
			{
				/*
				 * No XLOG record, but still need to flag that XID exists
				 * on disk
				 */
				MyXactMadeTempRelUpdate = true;
			}

			END_CRIT_SECTION();

			cur_page->offsets_used++;
			num_moved++;
			cur_page->free = ((PageHeader) ToPage)->pd_upper - ((PageHeader) ToPage)->pd_lower;
			if (cur_page->blkno > last_move_dest_block)
				last_move_dest_block = cur_page->blkno;

			vacpage->offsets[vacpage->offsets_free++] = offnum;

			LockBuffer(cur_buffer, BUFFER_LOCK_UNLOCK);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);

			/* insert index tuples if needed */
			if (resultRelInfo->ri_NumIndices > 0)
			{
				ExecStoreTuple(&newtup, slot, InvalidBuffer, false);
				ExecInsertIndexTuples(slot, &(newtup.t_self), estate, true);
			}
		}						/* walk along page */

		/*
		 * If we broke out of the walk-along-page loop early (ie, we still
		 * have offnum <= maxoff), then we failed to move some tuple off
		 * this page.  No point in shrinking any more, so clean up and
		 * exit the per-page loop.
		 */
		if (offnum < maxoff && keep_tuples > 0)
		{
			OffsetNumber off;

			/*
			 * Fix vacpage state for any unvisited tuples remaining on
			 * page
			 */
			for (off = OffsetNumberNext(offnum);
				 off <= maxoff;
				 off = OffsetNumberNext(off))
			{
				itemid = PageGetItemId(page, off);
				if (!ItemIdIsUsed(itemid))
					continue;
				tuple.t_datamcxt = NULL;
				tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
				if (tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)
					continue;
				if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
					elog(ERROR, "HEAP_MOVED_IN was not expected (2)");
				if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
				{
					if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
						elog(ERROR, "Invalid XVAC in tuple header (4)");
					if (chain_tuple_moved)
					{
						/* some chain tuples were moved while cleaning this page */
						Assert(vacpage->offsets_free > 0);
						for (i = 0; i < vacpage->offsets_free; i++)
						{
							if (vacpage->offsets[i] == off)
								break;
						}
						if (i >= vacpage->offsets_free) /* not found */
						{
							vacpage->offsets[vacpage->offsets_free++] = off;
							Assert(keep_tuples > 0);
							keep_tuples--;
						}
					}
					else
					{
						vacpage->offsets[vacpage->offsets_free++] = off;
						Assert(keep_tuples > 0);
						keep_tuples--;
					}
				}
				else
					elog(ERROR, "HEAP_MOVED_OFF was expected (2)");
			}
		}

		if (vacpage->offsets_free > 0)	/* some tuples were moved */
		{
			if (chain_tuple_moved)		/* else - they are ordered */
			{
				qsort((char *) (vacpage->offsets), vacpage->offsets_free,
					  sizeof(OffsetNumber), vac_cmp_offno);
			}
			vpage_insert(&Nvacpagelist, copy_vac_page(vacpage));
			WriteBuffer(buf);
		}
		else if (dowrite)
			WriteBuffer(buf);
		else
			ReleaseBuffer(buf);

		if (offnum <= maxoff)
			break;				/* had to quit early, see above note */

	}							/* walk along relation */

	blkno++;					/* new number of blocks */

	if (cur_buffer != InvalidBuffer)
	{
		Assert(num_moved > 0);
		WriteBuffer(cur_buffer);
	}

	if (num_moved > 0)
	{
		/*
		 * We have to commit our tuple movings before we truncate the
		 * relation.  Ideally we should do Commit/StartTransactionCommand
		 * here, relying on the session-level table lock to protect our
		 * exclusive access to the relation.  However, that would require
		 * a lot of extra code to close and re-open the relation, indexes,
		 * etc.  For now, a quick hack: record status of current
		 * transaction as committed, and continue.
		 */
		RecordTransactionCommit();
	}

	/*
	 * We are not going to move any more tuples across pages, but we still
	 * need to apply vacuum_page to compact free space in the remaining
	 * pages in vacuum_pages list.	Note that some of these pages may also
	 * be in the fraged_pages list, and may have had tuples moved onto
	 * them; if so, we already did vacuum_page and needn't do it again.
	 */
	for (i = 0, curpage = vacuum_pages->pagedesc;
		 i < vacuumed_pages;
		 i++, curpage++)
	{
		CHECK_FOR_INTERRUPTS();
		Assert((*curpage)->blkno < blkno);
		if ((*curpage)->offsets_used == 0)
		{
			/* this page was not used as a move target, so must clean it */
			buf = ReadBuffer(onerel, (*curpage)->blkno);
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
			page = BufferGetPage(buf);
			if (!PageIsEmpty(page))
				vacuum_page(onerel, buf, *curpage);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			WriteBuffer(buf);
		}
	}

	/*
	 * Now scan all the pages that we moved tuples onto and update tuple
	 * status bits.  This is not really necessary, but will save time for
	 * future transactions examining these tuples.
	 *
	 * XXX NOTICE that this code fails to clear HEAP_MOVED_OFF tuples from
	 * pages that were move source pages but not move dest pages.  One
	 * also wonders whether it wouldn't be better to skip this step and
	 * let the tuple status updates happen someplace that's not holding an
	 * exclusive lock on the relation.
	 */
	checked_moved = 0;
	for (i = 0, curpage = fraged_pages->pagedesc;
		 i < num_fraged_pages;
		 i++, curpage++)
	{
		CHECK_FOR_INTERRUPTS();
		Assert((*curpage)->blkno < blkno);
		if ((*curpage)->blkno > last_move_dest_block)
			break;				/* no need to scan any further */
		if ((*curpage)->offsets_used == 0)
			continue;			/* this page was never used as a move dest */
		buf = ReadBuffer(onerel, (*curpage)->blkno);
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		page = BufferGetPage(buf);
		num_tuples = 0;
		max_offset = PageGetMaxOffsetNumber(page);
		for (newoff = FirstOffsetNumber;
			 newoff <= max_offset;
			 newoff = OffsetNumberNext(newoff))
		{
			itemid = PageGetItemId(page, newoff);
			if (!ItemIdIsUsed(itemid))
				continue;
			tuple.t_datamcxt = NULL;
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
			{
				if (!(tuple.t_data->t_infomask & HEAP_MOVED))
					elog(ERROR, "HEAP_MOVED_OFF/HEAP_MOVED_IN was expected");
				if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
					elog(ERROR, "Invalid XVAC in tuple header (2)");
				if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
				{
					tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
					tuple.t_data->t_infomask &= ~HEAP_MOVED;
					num_tuples++;
				}
				else
					tuple.t_data->t_infomask |= HEAP_XMIN_INVALID;
			}
		}
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		WriteBuffer(buf);
		Assert((*curpage)->offsets_used == num_tuples);
		checked_moved += num_tuples;
	}
	Assert(num_moved == checked_moved);

	elog(elevel, "Rel %s: Pages: %u --> %u; Tuple(s) moved: %u.\n\t%s",
		 RelationGetRelationName(onerel),
		 nblocks, blkno, num_moved,
		 vac_show_rusage(&ru0));

	/*
	 * Reflect the motion of system tuples to catalog cache here.
	 */
	CommandCounterIncrement();

	if (Nvacpagelist.num_pages > 0)
	{
		/* vacuum indexes again if needed */
		if (Irel != (Relation *) NULL)
		{
			VacPage    *vpleft,
					   *vpright,
						vpsave;

			/*
			 * Re-sort Nvacpagelist.pagedesc into ascending block order;
			 * the entries were collected in descending order because the
			 * relation was scanned back to front.
			 */
			for (vpleft = Nvacpagelist.pagedesc,
			vpright = Nvacpagelist.pagedesc + Nvacpagelist.num_pages - 1;
				 vpleft < vpright; vpleft++, vpright--)
			{
				vpsave = *vpleft;
				*vpleft = *vpright;
				*vpright = vpsave;
			}
			Assert(keep_tuples >= 0);
			for (i = 0; i < nindexes; i++)
				vacuum_index(&Nvacpagelist, Irel[i],
							 vacrelstats->rel_tuples, keep_tuples);
		}

		/* clean moved tuples from last page in Nvacpagelist list */
		if (vacpage->blkno == (blkno - 1) &&
			vacpage->offsets_free > 0)
		{
			OffsetNumber unused[BLCKSZ / sizeof(OffsetNumber)];
			int			uncnt;

			buf = ReadBuffer(onerel, vacpage->blkno);
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
			page = BufferGetPage(buf);
			num_tuples = 0;
			maxoff = PageGetMaxOffsetNumber(page);
			for (offnum = FirstOffsetNumber;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				itemid = PageGetItemId(page, offnum);
				if (!ItemIdIsUsed(itemid))
					continue;
				tuple.t_datamcxt = NULL;
				tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);

				if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
				{
					if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
					{
						if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
							elog(ERROR, "Invalid XVAC in tuple header (3)");
						itemid->lp_flags &= ~LP_USED;
						num_tuples++;
					}
					else
						elog(ERROR, "HEAP_MOVED_OFF was expected (3)");
				}
			}
			Assert(vacpage->offsets_free == num_tuples);

			START_CRIT_SECTION();

			uncnt = PageRepairFragmentation(page, unused);

			/* XLOG stuff */
			if (!onerel->rd_istemp)
			{
				XLogRecPtr	recptr;

				recptr = log_heap_clean(onerel, buf, unused, uncnt);
				PageSetLSN(page, recptr);
				PageSetSUI(page, ThisStartUpID);
			}
			else
			{
				/*
				 * No XLOG record, but still need to flag that XID exists
				 * on disk
				 */
				MyXactMadeTempRelUpdate = true;
			}

			END_CRIT_SECTION();

			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			WriteBuffer(buf);
		}

		/* now - free new list of reaped pages */
		curpage = Nvacpagelist.pagedesc;
		for (i = 0; i < Nvacpagelist.num_pages; i++, curpage++)
			pfree(*curpage);
		pfree(Nvacpagelist.pagedesc);
	}

	/*
	 * Flush dirty pages out to disk.  We do this unconditionally, even if
	 * we don't need to truncate, because we want to ensure that all
	 * tuples have correct on-row commit status on disk (see bufmgr.c's
	 * comments for FlushRelationBuffers()).
	 */
	i = FlushRelationBuffers(onerel, blkno);
	if (i < 0)
		elog(ERROR, "VACUUM (repair_frag): FlushRelationBuffers returned %d",
			 i);

	/* truncate relation, if needed */
	if (blkno < nblocks)
	{
		blkno = smgrtruncate(DEFAULT_SMGR, onerel, blkno);
		onerel->rd_nblocks = blkno;		/* update relcache immediately */
		onerel->rd_targblock = InvalidBlockNumber;
		vacrelstats->rel_pages = blkno; /* set new number of blocks */
	}

	/* clean up */
	pfree(vacpage);
	if (vacrelstats->vtlinks != NULL)
		pfree(vacrelstats->vtlinks);

	ExecDropTupleTable(tupleTable, true);

	ExecCloseIndices(resultRelInfo);

	FreeExecutorState(estate);
}

/*
 *	vacuum_heap() -- free dead tuples
 *
 *		This routine marks dead tuples as unused and truncates relation
 *		if there are "empty" end-blocks.
 */
static void
vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
{
	Buffer		buf;
	VacPage    *vacpage;
	BlockNumber relblocks;
	int			nblocks;
	int			i;

	nblocks = vacuum_pages->num_pages;
	nblocks -= vacuum_pages->empty_end_pages;	/* nothing to do with them */

	for (i = 0, vacpage = vacuum_pages->pagedesc; i < nblocks; i++, vacpage++)
	{
		CHECK_FOR_INTERRUPTS();
		if ((*vacpage)->offsets_free > 0)
		{
			buf = ReadBuffer(onerel, (*vacpage)->blkno);
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
			vacuum_page(onerel, buf, *vacpage);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			WriteBuffer(buf);
		}
	}

	/*
	 * Flush dirty pages out to disk.  We do this unconditionally, even if
	 * we don't need to truncate, because we want to ensure that all
	 * tuples have correct on-row commit status on disk (see bufmgr.c's
	 * comments for FlushRelationBuffers()).
	 */
	Assert(vacrelstats->rel_pages >= vacuum_pages->empty_end_pages);
	relblocks = vacrelstats->rel_pages - vacuum_pages->empty_end_pages;

	i = FlushRelationBuffers(onerel, relblocks);
	if (i < 0)
		elog(ERROR, "VACUUM (vacuum_heap): FlushRelationBuffers returned %d",
			 i);

	/* truncate relation if there are some empty end-pages */
	if (vacuum_pages->empty_end_pages > 0)
	{
		elog(elevel, "Rel %s: Pages: %u --> %u.",
			 RelationGetRelationName(onerel),
			 vacrelstats->rel_pages, relblocks);
		relblocks = smgrtruncate(DEFAULT_SMGR, onerel, relblocks);
		onerel->rd_nblocks = relblocks; /* update relcache immediately */
		onerel->rd_targblock = InvalidBlockNumber;
		vacrelstats->rel_pages = relblocks;		/* set new number of
												 * blocks */
	}
}

/*
 *	vacuum_page() -- free dead tuples on a page
 *					 and repair its fragmentation.
 */
static void
vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
{
	OffsetNumber unused[BLCKSZ / sizeof(OffsetNumber)];
	int			uncnt;
	Page		page = BufferGetPage(buffer);
	ItemId		itemid;
	int			i;

	/* There shouldn't be any tuples moved onto the page yet! */
	Assert(vacpage->offsets_used == 0);

	START_CRIT_SECTION();

	for (i = 0; i < vacpage->offsets_free; i++)
	{
		itemid = PageGetItemId(page, vacpage->offsets[i]);
		itemid->lp_flags &= ~LP_USED;
	}

	uncnt = PageRepairFragmentation(page, unused);

	/* XLOG stuff */
	if (!onerel->rd_istemp)
	{
		XLogRecPtr	recptr;

		recptr = log_heap_clean(onerel, buffer, unused, uncnt);
		PageSetLSN(page, recptr);
		PageSetSUI(page, ThisStartUpID);
	}
	else
	{
		/* No XLOG record, but still need to flag that XID exists on disk */
		MyXactMadeTempRelUpdate = true;
	}

	END_CRIT_SECTION();
}

/*
 *	scan_index() -- scan one index relation to update statistics.
 *
 * We use this when we have no deletions to do.
 */
static void
scan_index(Relation indrel, double num_tuples)
{
	IndexBulkDeleteResult *stats;
	IndexVacuumCleanupInfo vcinfo;
	VacRUsage	ru0;

	vac_init_rusage(&ru0);

	/*
	 * Even though we're not planning to delete anything, we use the
	 * ambulkdelete call, because (a) the scan happens within the index AM
	 * for more speed, and (b) it may want to pass private statistics to
	 * the amvacuumcleanup call.
	 */
	stats = index_bulk_delete(indrel, dummy_tid_reaped, NULL);

	/* Do post-VACUUM cleanup, even though we deleted nothing */
	vcinfo.vacuum_full = true;
	vcinfo.message_level = elevel;

	stats = index_vacuum_cleanup(indrel, &vcinfo, stats);

	if (!stats)
		return;

	/* now update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(indrel),
						stats->num_pages, stats->num_index_tuples,
						false);

	elog(elevel, "Index %s: Pages %u, %u deleted, %u free; Tuples %.0f.\n\t%s",
		 RelationGetRelationName(indrel),
		 stats->num_pages, stats->pages_deleted, stats->pages_free,
		 stats->num_index_tuples,
		 vac_show_rusage(&ru0));

	/*
	 * Check for tuple count mismatch.	If the index is partial, then it's
	 * OK for it to have fewer tuples than the heap; else we got trouble.
	 */
	if (stats->num_index_tuples != num_tuples)
	{
		if (stats->num_index_tuples > num_tuples ||
			!vac_is_partial_index(indrel))
			elog(WARNING, "Index %s: NUMBER OF INDEX' TUPLES (%.0f) IS NOT THE SAME AS HEAP' (%.0f)."
				 "\n\tRecreate the index.",
				 RelationGetRelationName(indrel),
				 stats->num_index_tuples, num_tuples);
	}

	pfree(stats);
}

/*
 *	vacuum_index() -- vacuum one index relation.
 *
 *		Vpl is the VacPageList of the heap we're currently vacuuming.
 *		It's locked. Indrel is an index relation on the vacuumed heap.
 *
 *		We don't bother to set locks on the index relation here, since
 *		the parent table is exclusive-locked already.
 *
 *		Finally, we arrange to update the index relation's statistics in
 *		pg_class.
 */
static void
vacuum_index(VacPageList vacpagelist, Relation indrel,
			 double num_tuples, int keep_tuples)
{
	IndexBulkDeleteResult *stats;
	IndexVacuumCleanupInfo vcinfo;
	VacRUsage	ru0;

	vac_init_rusage(&ru0);

	/* Do bulk deletion */
	stats = index_bulk_delete(indrel, tid_reaped, (void *) vacpagelist);

	/* Do post-VACUUM cleanup */
	vcinfo.vacuum_full = true;
	vcinfo.message_level = elevel;

	stats = index_vacuum_cleanup(indrel, &vcinfo, stats);

	if (!stats)
		return;

	/* now update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(indrel),
						stats->num_pages, stats->num_index_tuples,
						false);

	elog(elevel, "Index %s: Pages %u, %u deleted, %u free; Tuples %.0f: Deleted %.0f.\n\t%s",
		 RelationGetRelationName(indrel),
		 stats->num_pages, stats->pages_deleted, stats->pages_free,
		 stats->num_index_tuples - keep_tuples, stats->tuples_removed,
		 vac_show_rusage(&ru0));

	/*
	 * Check for tuple count mismatch.	If the index is partial, then it's
	 * OK for it to have fewer tuples than the heap; else we got trouble.
	 */
	if (stats->num_index_tuples != num_tuples + keep_tuples)
	{
		if (stats->num_index_tuples > num_tuples + keep_tuples ||
			!vac_is_partial_index(indrel))
			elog(WARNING, "Index %s: NUMBER OF INDEX' TUPLES (%.0f) IS NOT THE SAME AS HEAP' (%.0f)."
				 "\n\tRecreate the index.",
				 RelationGetRelationName(indrel),
				 stats->num_index_tuples, num_tuples);
	}

	pfree(stats);
}

/*
 *	tid_reaped() -- is a particular tid reaped?
 *
 *		This has the right signature to be an IndexBulkDeleteCallback.
 *
 *		vacpagelist->VacPage_array is sorted in right order.
 */
static bool
tid_reaped(ItemPointer itemptr, void *state)
{
	VacPageList vacpagelist = (VacPageList) state;
	OffsetNumber ioffno;
	OffsetNumber *voff;
	VacPage		vp,
			   *vpp;
	VacPageData vacpage;

	vacpage.blkno = ItemPointerGetBlockNumber(itemptr);
	ioffno = ItemPointerGetOffsetNumber(itemptr);

	vp = &vacpage;
	vpp = (VacPage *) vac_bsearch((void *) &vp,
								  (void *) (vacpagelist->pagedesc),
								  vacpagelist->num_pages,
								  sizeof(VacPage),
								  vac_cmp_blk);

	if (vpp == NULL)
		return false;

	/* ok - we are on a partially or fully reaped page */
	vp = *vpp;

	if (vp->offsets_free == 0)
	{
		/* this is EmptyPage, so claim all tuples on it are reaped!!! */
		return true;
	}

	voff = (OffsetNumber *) vac_bsearch((void *) &ioffno,
										(void *) (vp->offsets),
										vp->offsets_free,
										sizeof(OffsetNumber),
										vac_cmp_offno);

	if (voff == NULL)
		return false;

	/* tid is reaped */
	return true;
}
/*
 * Dummy version for scan_index.
 */
static bool
dummy_tid_reaped(ItemPointer itemptr, void *state)
{
	return false;
}

/*
 * Update the shared Free Space Map with the info we now have about
 * free space in the relation, discarding any old info the map may have.
 */
static void
vac_update_fsm(Relation onerel, VacPageList fraged_pages,
			   BlockNumber rel_pages)
{
	int			nPages = fraged_pages->num_pages;
	VacPage    *pagedesc = fraged_pages->pagedesc;
	Size		threshold;
	PageFreeSpaceInfo *pageSpaces;
	int			outPages;
	int			i;

	/*
	 * We only report pages with free space at least equal to the average
	 * request size --- this avoids cluttering FSM with uselessly-small bits
	 * of space.  Although FSM would discard pages with little free space
	 * anyway, it's important to do this prefiltering because (a) it reduces
	 * the time spent holding the FSM lock in RecordRelationFreeSpace, and
	 * (b) FSM uses the number of pages reported as a statistic for guiding
	 * space management.  If we didn't threshold our reports the same way
	 * vacuumlazy.c does, we'd be skewing that statistic.
	 */
	threshold = GetAvgFSMRequestSize(&onerel->rd_node);

	/* +1 to avoid palloc(0) */
	pageSpaces = (PageFreeSpaceInfo *)
		palloc((nPages + 1) * sizeof(PageFreeSpaceInfo));
	outPages = 0;

	for (i = 0; i < nPages; i++)
	{
		/*
		 * fraged_pages may contain entries for pages that we later
		 * decided to truncate from the relation; don't enter them into
		 * the free space map!
		 */
		if (pagedesc[i]->blkno >= rel_pages)
			break;

		if (pagedesc[i]->free >= threshold)
		{
			pageSpaces[outPages].blkno = pagedesc[i]->blkno;
			pageSpaces[outPages].avail = pagedesc[i]->free;
			outPages++;
		}
	}

	RecordRelationFreeSpace(&onerel->rd_node, outPages, pageSpaces);

	pfree(pageSpaces);
}

/* Copy a VacPage structure */
static VacPage
copy_vac_page(VacPage vacpage)
{
	VacPage		newvacpage;

	/* allocate a VacPageData entry */
	newvacpage = (VacPage) palloc(sizeof(VacPageData) +
						   vacpage->offsets_free * sizeof(OffsetNumber));

	/* fill it in */
	if (vacpage->offsets_free > 0)
		memcpy(newvacpage->offsets, vacpage->offsets,
			   vacpage->offsets_free * sizeof(OffsetNumber));
	newvacpage->blkno = vacpage->blkno;
	newvacpage->free = vacpage->free;
	newvacpage->offsets_used = vacpage->offsets_used;
	newvacpage->offsets_free = vacpage->offsets_free;

	return newvacpage;
}

/*
 * Add a VacPage pointer to a VacPageList.
 *
 *		As a side effect of the way that scan_heap works,
 *		higher pages come after lower pages in the array
 *		(and highest tid on a page is last).
 */
static void
vpage_insert(VacPageList vacpagelist, VacPage vpnew)
{
#define PG_NPAGEDESC 1024

	/* allocate a VacPage entry if needed */
	if (vacpagelist->num_pages == 0)
	{
		vacpagelist->pagedesc = (VacPage *) palloc(PG_NPAGEDESC * sizeof(VacPage));
		vacpagelist->num_allocated_pages = PG_NPAGEDESC;
	}
	else if (vacpagelist->num_pages >= vacpagelist->num_allocated_pages)
	{
		vacpagelist->num_allocated_pages *= 2;
		vacpagelist->pagedesc = (VacPage *) repalloc(vacpagelist->pagedesc, vacpagelist->num_allocated_pages * sizeof(VacPage));
	}
	vacpagelist->pagedesc[vacpagelist->num_pages] = vpnew;
	(vacpagelist->num_pages)++;
}

/*
 * vac_bsearch: just like standard C library routine bsearch(),
 * except that we first test to see whether the target key is outside
 * the range of the table entries.	This case is handled relatively slowly
 * by the normal binary search algorithm (ie, no faster than any other key)
 * but it occurs often enough in VACUUM to be worth optimizing.
 */
static void *
vac_bsearch(const void *key, const void *base,
			size_t nelem, size_t size,
			int (*compar) (const void *, const void *))
{
	int			res;
	const void *last;

	if (nelem == 0)
		return NULL;
	res = compar(key, base);
	if (res < 0)
		return NULL;
	if (res == 0)
		return (void *) base;
	if (nelem > 1)
	{
		last = (const void *) ((const char *) base + (nelem - 1) * size);
		res = compar(key, last);
		if (res > 0)
			return NULL;
		if (res == 0)
			return (void *) last;
	}
	if (nelem <= 2)
		return NULL;			/* already checked 'em all */
	return bsearch(key, base, nelem, size, compar);
}
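
/*
 * Example (cf. tid_reaped above): probing the sorted page-descriptor
 * array for the entry matching a given block number:
 *
 *		vpp = (VacPage *) vac_bsearch((void *) &vp,
 *									  (void *) (vacpagelist->pagedesc),
 *									  vacpagelist->num_pages,
 *									  sizeof(VacPage),
 *									  vac_cmp_blk);
 */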

/*
 * Comparator routines for use with qsort() and bsearch().
 */
static int
vac_cmp_blk(const void *left, const void *right)
{
	BlockNumber lblk,
				rblk;

	lblk = (*((VacPage *) left))->blkno;
	rblk = (*((VacPage *) right))->blkno;

	if (lblk < rblk)
		return -1;
	if (lblk == rblk)
		return 0;
	return 1;
}

static int
vac_cmp_offno(const void *left, const void *right)
{
	if (*(OffsetNumber *) left < *(OffsetNumber *) right)
		return -1;
	if (*(OffsetNumber *) left == *(OffsetNumber *) right)
		return 0;
	return 1;
}
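
/*
 * vtlinks entries are kept sorted on new_tid (block number, then offset)
 * so that repair_frag can look up a tuple's parent entry by the child's
 * TID via vac_bsearch with this comparator.
 */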
static int
vac_cmp_vtlinks(const void *left, const void *right)
{
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi <
		((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
		return -1;
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi >
		((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
		return 1;
	/* bi_hi-es are equal */
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo <
		((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
		return -1;
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo >
		((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
		return 1;
	/* bi_lo-es are equal */
	if (((VTupleLink) left)->new_tid.ip_posid <
		((VTupleLink) right)->new_tid.ip_posid)
		return -1;
	if (((VTupleLink) left)->new_tid.ip_posid >
		((VTupleLink) right)->new_tid.ip_posid)
		return 1;
	return 0;
}
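
/*
 * vac_open_indexes -- open all indexes of the given relation, returning
 * their number in *nindexes and the opened relations in a palloc'd
 * array in *Irel.
 */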
void
vac_open_indexes(Relation relation, int *nindexes, Relation **Irel)
{
	List	   *indexoidlist,
			   *indexoidscan;
	int			i;

	indexoidlist = RelationGetIndexList(relation);

	*nindexes = length(indexoidlist);

	if (*nindexes > 0)
		*Irel = (Relation *) palloc(*nindexes * sizeof(Relation));
	else
		*Irel = NULL;

	i = 0;
	foreach(indexoidscan, indexoidlist)
	{
		Oid			indexoid = lfirsto(indexoidscan);

		(*Irel)[i] = index_open(indexoid);
		i++;
	}

	freeList(indexoidlist);
}
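
/*
 * vac_close_indexes -- close the indexes opened by vac_open_indexes
 * and release the array.
 */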
void
vac_close_indexes(int nindexes, Relation *Irel)
{
	if (Irel == (Relation *) NULL)
		return;

	while (nindexes--)
		index_close(Irel[nindexes]);
	pfree(Irel);
}

/*
 * Is an index partial (ie, could it contain fewer tuples than the heap?)
 */
bool
vac_is_partial_index(Relation indrel)
{
	/*
	 * If the index's AM doesn't support nulls, it's partial for our
	 * purposes
	 */
	if (!indrel->rd_am->amindexnulls)
		return true;

	/* Otherwise, look to see if there's a partial-index predicate */
	return (VARSIZE(&indrel->rd_index->indpred) > VARHDRSZ);
}
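
/*
 * enough_space -- will a tuple of size len fit on vacpage, allowing for
 * a new line pointer when no reusable one is available?
 */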
static bool
enough_space(VacPage vacpage, Size len)
{
	len = MAXALIGN(len);

	if (len > vacpage->free)
		return false;

	/* if there are free itemid(s) and len <= free_space... */
	if (vacpage->offsets_used < vacpage->offsets_free)
		return true;

	/* noff_used >= noff_free and so we'll have to allocate new itemid */
	if (len + sizeof(ItemIdData) <= vacpage->free)
		return true;

	return false;
}


/*
 * Initialize usage snapshot.
 */
void
vac_init_rusage(VacRUsage *ru0)
{
	struct timezone tz;

	getrusage(RUSAGE_SELF, &ru0->ru);
	gettimeofday(&ru0->tv, &tz);
}

/*
 * Compute elapsed time since ru0 usage snapshot, and format into
 * a displayable string.  Result is in a static string, which is
 * tacky, but no one ever claimed that the Postgres backend is
 * threadable...
 */
const char *
vac_show_rusage(VacRUsage *ru0)
{
	static char result[100];
	VacRUsage	ru1;

	vac_init_rusage(&ru1);

	/* Borrow a second wherever a microsecond difference would go negative */
	if (ru1.tv.tv_usec < ru0->tv.tv_usec)
	{
		ru1.tv.tv_sec--;
		ru1.tv.tv_usec += 1000000;
	}
	if (ru1.ru.ru_stime.tv_usec < ru0->ru.ru_stime.tv_usec)
	{
		ru1.ru.ru_stime.tv_sec--;
		ru1.ru.ru_stime.tv_usec += 1000000;
	}
	if (ru1.ru.ru_utime.tv_usec < ru0->ru.ru_utime.tv_usec)
	{
		ru1.ru.ru_utime.tv_sec--;
		ru1.ru.ru_utime.tv_usec += 1000000;
	}

	snprintf(result, sizeof(result),
			 "CPU %d.%02ds/%d.%02du sec elapsed %d.%02d sec.",
			 (int) (ru1.ru.ru_stime.tv_sec - ru0->ru.ru_stime.tv_sec),
	  (int) (ru1.ru.ru_stime.tv_usec - ru0->ru.ru_stime.tv_usec) / 10000,
			 (int) (ru1.ru.ru_utime.tv_sec - ru0->ru.ru_utime.tv_sec),
	  (int) (ru1.ru.ru_utime.tv_usec - ru0->ru.ru_utime.tv_usec) / 10000,
			 (int) (ru1.tv.tv_sec - ru0->tv.tv_sec),
			 (int) (ru1.tv.tv_usec - ru0->tv.tv_usec) / 10000);

	return result;
}