/*-------------------------------------------------------------------------
 *
 * vacuum.c
 *	  The postgres vacuum cleaner.
 *
 * This file includes the "full" version of VACUUM, as well as control code
 * used by all three of full VACUUM, lazy VACUUM, and ANALYZE.	See
 * vacuumlazy.c and analyze.c for the rest of the code for the latter two.
 *
 *
 * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.242 2002/10/19 20:15:09 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <unistd.h>

#include "access/clog.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "catalog/catname.h"
#include "catalog/namespace.h"
#include "catalog/pg_database.h"
#include "catalog/pg_index.h"
#include "commands/vacuum.h"
#include "executor/executor.h"
#include "miscadmin.h"
#include "storage/freespace.h"
#include "storage/sinval.h"
#include "storage/smgr.h"
#include "tcop/pquery.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/relcache.h"
#include "utils/syscache.h"
#include "pgstat.h"

typedef struct VacPageData
{
	BlockNumber blkno;			/* BlockNumber of this Page */
	Size		free;			/* FreeSpace on this Page */
	uint16		offsets_used;	/* Number of OffNums used by vacuum */
	uint16		offsets_free;	/* Number of OffNums free or to be free */
	OffsetNumber offsets[1];	/* Array of free OffNums */
} VacPageData;

typedef VacPageData *VacPage;
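
/*
 * Note: offsets[1] above uses the classic C "struct hack": each VacPage
 * is palloc'd with room for up to MaxOffsetNumber entries, as done in
 * scan_heap() and repair_frag() below:
 *
 *		vacpage = (VacPage) palloc(sizeof(VacPageData) +
 *								   MaxOffsetNumber * sizeof(OffsetNumber));
 */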

typedef struct VacPageListData
{
	BlockNumber empty_end_pages;	/* Number of "empty" end-pages */
	int			num_pages;		/* Number of pages in pagedesc */
	int			num_allocated_pages;	/* Number of allocated pages in
										 * pagedesc */
	VacPage    *pagedesc;		/* Descriptions of pages */
} VacPageListData;

typedef VacPageListData *VacPageList;

typedef struct VTupleLinkData
{
	ItemPointerData new_tid;
	ItemPointerData this_tid;
} VTupleLinkData;

typedef VTupleLinkData *VTupleLink;

typedef struct VTupleMoveData
{
	ItemPointerData tid;		/* tuple ID */
	VacPage		vacpage;		/* where to move */
	bool		cleanVpd;		/* clean vacpage before using */
} VTupleMoveData;

typedef VTupleMoveData *VTupleMove;

typedef struct VRelStats
{
	BlockNumber rel_pages;
	double		rel_tuples;
	Size		min_tlen;
	Size		max_tlen;
	bool		hasindex;
	int			num_vtlinks;
	VTupleLink	vtlinks;
} VRelStats;
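
/*
 * Note: vtlinks is filled by scan_heap() with one entry per
 * recently-updated tuple (this_tid -> new_tid) and is qsort'ed with
 * vac_cmp_vtlinks, so that repair_frag() can search it while following
 * update chains.
 */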


static MemoryContext vac_context = NULL;

static int	elevel = -1;

static TransactionId OldestXmin;
static TransactionId FreezeLimit;


/* non-export function prototypes */
static List *getrels(const RangeVar *vacrel, const char *stmttype);
static void vac_update_dbstats(Oid dbid,
				   TransactionId vacuumXID,
				   TransactionId frozenXID);
static void vac_truncate_clog(TransactionId vacuumXID,
				  TransactionId frozenXID);
static bool vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind);
static void full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt);
static void scan_heap(VRelStats *vacrelstats, Relation onerel,
		  VacPageList vacuum_pages, VacPageList fraged_pages);
static void repair_frag(VRelStats *vacrelstats, Relation onerel,
			VacPageList vacuum_pages, VacPageList fraged_pages,
			int nindexes, Relation *Irel);
static void vacuum_heap(VRelStats *vacrelstats, Relation onerel,
			VacPageList vacpagelist);
static void vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage);
static void vacuum_index(VacPageList vacpagelist, Relation indrel,
			 double num_tuples, int keep_tuples);
static void scan_index(Relation indrel, double num_tuples);
static bool tid_reaped(ItemPointer itemptr, void *state);
static bool dummy_tid_reaped(ItemPointer itemptr, void *state);
static void vac_update_fsm(Relation onerel, VacPageList fraged_pages,
			   BlockNumber rel_pages);
static VacPage copy_vac_page(VacPage vacpage);
static void vpage_insert(VacPageList vacpagelist, VacPage vpnew);
static void *vac_bsearch(const void *key, const void *base,
			size_t nelem, size_t size,
			int (*compar) (const void *, const void *));
static int	vac_cmp_blk(const void *left, const void *right);
static int	vac_cmp_offno(const void *left, const void *right);
static int	vac_cmp_vtlinks(const void *left, const void *right);
static bool enough_space(VacPage vacpage, Size len);


/****************************************************************************
 *																			*
 *			Code common to all flavors of VACUUM and ANALYZE				*
 *																			*
 ****************************************************************************
 */


/*
 * Primary entry point for VACUUM and ANALYZE commands.
 */
void
vacuum(VacuumStmt *vacstmt)
{
	const char *stmttype = vacstmt->vacuum ? "VACUUM" : "ANALYZE";
	MemoryContext anl_context = NULL;
	TransactionId initialOldestXmin = InvalidTransactionId;
	TransactionId initialFreezeLimit = InvalidTransactionId;
	bool		all_rels;
	List	   *vrl,
			   *cur;

	if (vacstmt->verbose)
		elevel = INFO;
	else
		elevel = DEBUG1;

	/*
	 * We cannot run VACUUM inside a user transaction block; if we were
	 * inside a transaction, then our commit- and
	 * start-transaction-command calls would not have the intended effect!
	 * Furthermore, the forced commit that occurs before truncating the
	 * relation's file would have the effect of committing the rest of the
	 * user's transaction too, which would certainly not be the desired
	 * behavior.
	 */
	if (vacstmt->vacuum && IsTransactionBlock())
		elog(ERROR, "%s cannot run inside a BEGIN/END block", stmttype);

	/* Running VACUUM from a function would free the function context */
	if (vacstmt->vacuum && !MemoryContextContains(QueryContext, vacstmt))
		elog(ERROR, "%s cannot be executed from a function", stmttype);

	/*
	 * Send info about dead objects to the statistics collector
	 */
	if (vacstmt->vacuum)
		pgstat_vacuum_tabstat();

	/*
	 * Create special memory context for cross-transaction storage.
	 *
	 * Since it is a child of QueryContext, it will go away eventually even
	 * if we suffer an error; there's no need for special abort cleanup
	 * logic.
	 */
	vac_context = AllocSetContextCreate(QueryContext,
										"Vacuum",
										ALLOCSET_DEFAULT_MINSIZE,
										ALLOCSET_DEFAULT_INITSIZE,
										ALLOCSET_DEFAULT_MAXSIZE);

	/*
	 * If we are running only ANALYZE, we don't need per-table
	 * transactions, but we still need a memory context with table
	 * lifetime.
	 */
	if (vacstmt->analyze && !vacstmt->vacuum)
		anl_context = AllocSetContextCreate(QueryContext,
											"Analyze",
											ALLOCSET_DEFAULT_MINSIZE,
											ALLOCSET_DEFAULT_INITSIZE,
											ALLOCSET_DEFAULT_MAXSIZE);

	/* Assume we are processing everything unless one table is mentioned */
	all_rels = (vacstmt->relation == NULL);

	/* Build list of relations to process (note this lives in vac_context) */
	vrl = getrels(vacstmt->relation, stmttype);

	/*
	 * Formerly, there was code here to prevent more than one VACUUM from
	 * executing concurrently in the same database.  However, there's no
	 * good reason to prevent that, and manually removing lockfiles after
	 * a vacuum crash was a pain for dbadmins.	So, forget about
	 * lockfiles, and just rely on the locks we grab on each target table
	 * to ensure that there aren't two VACUUMs running on the same table
	 * at the same time.
	 */

	/*
	 * The strangeness with committing and starting transactions here is
	 * due to wanting to run each table's VACUUM as a separate
	 * transaction, so that we don't hold locks unnecessarily long.  Also,
	 * if we are doing VACUUM ANALYZE, the ANALYZE part runs as a separate
	 * transaction from the VACUUM to further reduce locking.
	 *
	 * vacuum_rel expects to be entered with no transaction active; it will
	 * start and commit its own transaction.  But we are called by an SQL
	 * command, and so we are executing inside a transaction already.  We
	 * commit the transaction started in PostgresMain() here, and start
	 * another one before exiting to match the commit waiting for us back
	 * in PostgresMain().
	 *
	 * In the case of an ANALYZE statement (no vacuum, just analyze) it's
	 * okay to run the whole thing in the outer transaction, and so we
	 * skip transaction start/stop operations.
	 */
	if (vacstmt->vacuum)
	{
		if (all_rels)
		{
			/*
			 * It's a database-wide VACUUM.
			 *
			 * Compute the initially applicable OldestXmin and FreezeLimit
			 * XIDs, so that we can record these values at the end of the
			 * VACUUM. Note that individual tables may well be processed
			 * with newer values, but we can guarantee that no
			 * (non-shared) relations are processed with older ones.
			 *
			 * It is okay to record non-shared values in pg_database, even
			 * though we may vacuum shared relations with older cutoffs,
			 * because only the minimum of the values present in
			 * pg_database matters.  We can be sure that shared relations
			 * have at some time been vacuumed with cutoffs no worse than
			 * the global minimum; for, if there is a backend in some
			 * other DB with xmin = OLDXMIN that's determining the cutoff
			 * with which we vacuum shared relations, it is not possible
			 * for that database to have a cutoff newer than OLDXMIN
			 * recorded in pg_database.
			 */
			vacuum_set_xid_limits(vacstmt, false,
								  &initialOldestXmin,
								  &initialFreezeLimit);
		}

		/* matches the StartTransaction in PostgresMain() */
		CommitTransactionCommand(true);
	}

	/*
	 * Loop to process each selected relation.
	 */
	foreach(cur, vrl)
	{
		Oid			relid = (Oid) lfirsti(cur);

		if (vacstmt->vacuum)
		{
			if (! vacuum_rel(relid, vacstmt, RELKIND_RELATION))
				all_rels = false; /* forget about updating dbstats */
		}
		if (vacstmt->analyze)
		{
			MemoryContext old_context = NULL;

			/*
			 * If we vacuumed, use new transaction for analyze.
			 * Otherwise, we can use the outer transaction, but we still
			 * need to call analyze_rel in a memory context that will be
			 * cleaned up on return (else we leak memory while processing
			 * multiple tables).
			 */
			if (vacstmt->vacuum)
			{
				StartTransactionCommand(true);
				SetQuerySnapshot();	/* might be needed for functional index */
			}
			else
				old_context = MemoryContextSwitchTo(anl_context);

			analyze_rel(relid, vacstmt);

			if (vacstmt->vacuum)
				CommitTransactionCommand(true);
			else
			{
				MemoryContextSwitchTo(old_context);
				MemoryContextResetAndDeleteChildren(anl_context);
			}
		}
	}

	/*
	 * Finish up processing.
	 */
	if (vacstmt->vacuum)
	{
		/* here, we are not in a transaction */

		/*
		 * This matches the CommitTransaction waiting for us in
		 * PostgresMain(). We tell xact.c not to chain the upcoming
		 * commit, so that a VACUUM doesn't start a transaction block,
		 * even when autocommit is off.
		 */
		StartTransactionCommand(true);

		/*
		 * If we completed a database-wide VACUUM without skipping any
		 * relations, update the database's pg_database row with info
		 * about the transaction IDs used, and try to truncate pg_clog.
		 */
		if (all_rels)
		{
			vac_update_dbstats(MyDatabaseId,
							   initialOldestXmin, initialFreezeLimit);
			vac_truncate_clog(initialOldestXmin, initialFreezeLimit);
		}
	}

	/*
	 * Clean up working storage --- note we must do this after
	 * StartTransactionCommand, else we might be trying to delete the
	 * active context!
	 */
	MemoryContextDelete(vac_context);
	vac_context = NULL;

	if (anl_context)
		MemoryContextDelete(anl_context);
}

/*
 * Build a list of Oids for each relation to be processed
 *
 * The list is built in vac_context so that it will survive across our
 * per-relation transactions.
 */
static List *
getrels(const RangeVar *vacrel, const char *stmttype)
{
	List	   *vrl = NIL;
	MemoryContext oldcontext;

	if (vacrel)
	{
		/* Process specific relation */
		Oid			relid;

		relid = RangeVarGetRelid(vacrel, false);

		/* Make a relation list entry for this guy */
		oldcontext = MemoryContextSwitchTo(vac_context);
		vrl = lappendi(vrl, relid);
		MemoryContextSwitchTo(oldcontext);
	}
	else
	{
		/* Process all plain relations listed in pg_class */
		Relation	pgclass;
		HeapScanDesc scan;
		HeapTuple	tuple;
		ScanKeyData key;

		ScanKeyEntryInitialize(&key, 0x0,
							   Anum_pg_class_relkind,
							   F_CHAREQ,
							   CharGetDatum(RELKIND_RELATION));

		pgclass = heap_openr(RelationRelationName, AccessShareLock);

		scan = heap_beginscan(pgclass, SnapshotNow, 1, &key);

		while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
		{
			/* Make a relation list entry for this guy */
			oldcontext = MemoryContextSwitchTo(vac_context);
			vrl = lappendi(vrl, HeapTupleGetOid(tuple));
			MemoryContextSwitchTo(oldcontext);
		}

		heap_endscan(scan);
		heap_close(pgclass, AccessShareLock);
	}

	return vrl;
}

/*
 * vacuum_set_xid_limits() -- compute oldest-Xmin and freeze cutoff points
 */
void
vacuum_set_xid_limits(VacuumStmt *vacstmt, bool sharedRel,
					  TransactionId *oldestXmin,
					  TransactionId *freezeLimit)
{
	TransactionId limit;

	*oldestXmin = GetOldestXmin(sharedRel);

	Assert(TransactionIdIsNormal(*oldestXmin));

	if (vacstmt->freeze)
	{
		/* FREEZE option: use oldest Xmin as freeze cutoff too */
		limit = *oldestXmin;
	}
	else
	{
		/*
		 * Normal case: freeze cutoff is well in the past, to wit, about
		 * halfway to the wrap horizon
		 */
		limit = GetCurrentTransactionId() - (MaxTransactionId >> 2);
	}
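
	/*
	 * In round numbers: with 32-bit XIDs, MaxTransactionId >> 2 is about
	 * one billion transactions, i.e. roughly half of the ~2-billion-XID
	 * wraparound horizon that the warnings in vac_truncate_clog() below
	 * refer to.
	 */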

	/*
	 * Be careful not to generate a "permanent" XID
	 */
	if (!TransactionIdIsNormal(limit))
		limit = FirstNormalTransactionId;

	/*
	 * Ensure sane relationship of limits
	 */
	if (TransactionIdFollows(limit, *oldestXmin))
	{
		elog(WARNING, "oldest Xmin is far in the past --- close open transactions soon to avoid wraparound problems");
		limit = *oldestXmin;
	}

	*freezeLimit = limit;
}


/*
 *	vac_update_relstats() -- update statistics for one relation
 *
 *		Update the whole-relation statistics that are kept in its pg_class
 *		row.  There are additional stats that will be updated if we are
 *		doing ANALYZE, but we always update these stats.  This routine works
 *		for both index and heap relation entries in pg_class.
 *
 *		We violate no-overwrite semantics here by storing new values for the
 *		statistics columns directly into the pg_class tuple that's already on
 *		the page.  The reason for this is that if we updated these tuples in
 *		the usual way, vacuuming pg_class itself wouldn't work very well ---
 *		by the time we got done with a vacuum cycle, most of the tuples in
 *		pg_class would've been obsoleted.  Of course, this only works for
 *		fixed-size never-null columns, but these are.
 *
 *		This routine is shared by full VACUUM, lazy VACUUM, and stand-alone
 *		ANALYZE.
 */
void
vac_update_relstats(Oid relid, BlockNumber num_pages, double num_tuples,
					bool hasindex)
{
	Relation	rd;
	HeapTupleData rtup;
	HeapTuple	ctup;
	Form_pg_class pgcform;
	Buffer		buffer;

	/*
	 * update number of tuples and number of pages in pg_class
	 */
	rd = heap_openr(RelationRelationName, RowExclusiveLock);

	ctup = SearchSysCache(RELOID,
						  ObjectIdGetDatum(relid),
						  0, 0, 0);
	if (!HeapTupleIsValid(ctup))
		elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
			 relid);

	/* get the buffer cache tuple */
	rtup.t_self = ctup->t_self;
	ReleaseSysCache(ctup);
	if (!heap_fetch(rd, SnapshotNow, &rtup, &buffer, false, NULL))
		elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
			 relid);

	/* overwrite the existing statistics in the tuple */
	pgcform = (Form_pg_class) GETSTRUCT(&rtup);
	pgcform->relpages = (int32) num_pages;
	pgcform->reltuples = num_tuples;
	pgcform->relhasindex = hasindex;

	/*
	 * If we have discovered that there are no indexes, then there's no
	 * primary key either.	This could be done more thoroughly...
	 */
	if (!hasindex)
		pgcform->relhaspkey = false;

	/*
	 * Invalidate the tuple in the catcaches; this also arranges to flush
	 * the relation's relcache entry.  (If we fail to commit for some
	 * reason, no flush will occur, but no great harm is done since there
	 * are no noncritical state updates here.)
	 */
	CacheInvalidateHeapTuple(rd, &rtup);

	/* Write the buffer */
	WriteBuffer(buffer);

	heap_close(rd, RowExclusiveLock);
}

/*
 *	vac_update_dbstats() -- update statistics for one database
 *
 *		Update the whole-database statistics that are kept in its pg_database
 *		row.
 *
 *		We violate no-overwrite semantics here by storing new values for the
 *		statistics columns directly into the tuple that's already on the page.
 *		As with vac_update_relstats, this avoids leaving dead tuples behind
 *		after a VACUUM; which is good since GetRawDatabaseInfo
 *		can get confused by finding dead tuples in pg_database.
 *
 *		This routine is shared by full and lazy VACUUM.  Note that it is only
 *		applied after a database-wide VACUUM operation.
 */
static void
vac_update_dbstats(Oid dbid,
				   TransactionId vacuumXID,
				   TransactionId frozenXID)
{
	Relation	relation;
	ScanKeyData entry[1];
	HeapScanDesc scan;
	HeapTuple	tuple;
	Form_pg_database dbform;

	relation = heap_openr(DatabaseRelationName, RowExclusiveLock);

	/* Must use a heap scan, since there's no syscache for pg_database */
	ScanKeyEntryInitialize(&entry[0], 0x0,
						   ObjectIdAttributeNumber, F_OIDEQ,
						   ObjectIdGetDatum(dbid));

	scan = heap_beginscan(relation, SnapshotNow, 1, entry);

	tuple = heap_getnext(scan, ForwardScanDirection);

	if (!HeapTupleIsValid(tuple))
		elog(ERROR, "database %u does not exist", dbid);

	dbform = (Form_pg_database) GETSTRUCT(tuple);

	/* overwrite the existing statistics in the tuple */
	dbform->datvacuumxid = vacuumXID;
	dbform->datfrozenxid = frozenXID;

	/* invalidate the tuple in the cache and write the buffer */
	CacheInvalidateHeapTuple(relation, tuple);
	WriteNoReleaseBuffer(scan->rs_cbuf);

	heap_endscan(scan);

	heap_close(relation, RowExclusiveLock);
}

/*
 *	vac_truncate_clog() -- attempt to truncate the commit log
 *
 *		Scan pg_database to determine the system-wide oldest datvacuumxid,
 *		and use it to truncate the transaction commit log (pg_clog).
 *		Also generate a warning if the system-wide oldest datfrozenxid
 *		seems to be in danger of wrapping around.
 *
 *		The passed XIDs are simply the ones I just wrote into my pg_database
 *		entry.	They're used to initialize the "min" calculations.
 *
 *		This routine is shared by full and lazy VACUUM.  Note that it is only
 *		applied after a database-wide VACUUM operation.
 */
static void
vac_truncate_clog(TransactionId vacuumXID, TransactionId frozenXID)
{
	TransactionId myXID;
	Relation	relation;
	HeapScanDesc scan;
	HeapTuple	tuple;
	int32		age;
	bool		vacuumAlreadyWrapped = false;
	bool		frozenAlreadyWrapped = false;

	myXID = GetCurrentTransactionId();

	relation = heap_openr(DatabaseRelationName, AccessShareLock);

	scan = heap_beginscan(relation, SnapshotNow, 0, NULL);

	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		Form_pg_database dbform = (Form_pg_database) GETSTRUCT(tuple);

		/* Ignore non-connectable databases (eg, template0) */
		/* It's assumed that these have been frozen correctly */
		if (!dbform->datallowconn)
			continue;

		if (TransactionIdIsNormal(dbform->datvacuumxid))
		{
			if (TransactionIdPrecedes(myXID, dbform->datvacuumxid))
				vacuumAlreadyWrapped = true;
			else if (TransactionIdPrecedes(dbform->datvacuumxid, vacuumXID))
				vacuumXID = dbform->datvacuumxid;
		}
		if (TransactionIdIsNormal(dbform->datfrozenxid))
		{
			if (TransactionIdPrecedes(myXID, dbform->datfrozenxid))
				frozenAlreadyWrapped = true;
			else if (TransactionIdPrecedes(dbform->datfrozenxid, frozenXID))
				frozenXID = dbform->datfrozenxid;
		}
	}

	heap_endscan(scan);

	heap_close(relation, AccessShareLock);

	/*
	 * Do not truncate CLOG if we seem to have suffered wraparound
	 * already; the computed minimum XID might be bogus.
	 */
	if (vacuumAlreadyWrapped)
	{
		elog(WARNING, "Some databases have not been vacuumed in over 2 billion transactions."
			 "\n\tYou may have already suffered transaction-wraparound data loss.");
		return;
	}
	/* Truncate CLOG to the oldest vacuumxid */
	TruncateCLOG(vacuumXID);

	/* Give warning about impending wraparound problems */
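	/*
	 * In round numbers: the age test below fires once the oldest
	 * datfrozenxid is more than (MaxTransactionId >> 3) * 3, about 1.6
	 * billion transactions, in the past; the "vacuum them within" figure
	 * is the headroom remaining before the (MaxTransactionId >> 1)
	 * comparison horizon, about 2.1 billion XIDs, is used up.
	 */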
	if (frozenAlreadyWrapped)
	{
		elog(WARNING, "Some databases have not been vacuumed in over 1 billion transactions."
			 "\n\tBetter vacuum them soon, or you may have a wraparound failure.");
	}
	else
	{
		age = (int32) (myXID - frozenXID);
		if (age > (int32) ((MaxTransactionId >> 3) * 3))
			elog(WARNING, "Some databases have not been vacuumed in %d transactions."
				 "\n\tBetter vacuum them within %d transactions,"
				 "\n\tor you may have a wraparound failure.",
				 age, (int32) (MaxTransactionId >> 1) - age);
	}
}


/****************************************************************************
 *																			*
 *			Code common to both flavors of VACUUM							*
 *																			*
 ****************************************************************************
 */


/*
 *	vacuum_rel() -- vacuum one heap relation
 *
 *		Returns TRUE if we actually processed the relation (or can ignore it
 *		for some reason), FALSE if we failed to process it due to permissions
 *		or other reasons.  (A FALSE result really means that some data
 *		may have been left unvacuumed, so we can't update XID stats.)
 *
 *		Doing one heap at a time incurs extra overhead, since we need to
 *		check that the heap exists again just before we vacuum it.	The
 *		reason that we do this is so that vacuuming can be spread across
 *		many small transactions.  Otherwise, two-phase locking would require
 *		us to lock the entire database during one pass of the vacuum cleaner.
 *
 *		At entry and exit, we are not inside a transaction.
 */
static bool
vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind)
{
	LOCKMODE	lmode;
	Relation	onerel;
	LockRelId	onerelid;
	Oid			toast_relid;
	bool		result;

	/* Begin a transaction for vacuuming this relation */
	StartTransactionCommand(true);
	SetQuerySnapshot();			/* might be needed for functional index */

	/*
	 * Check for user-requested abort.	Note we want this to be inside a
	 * transaction, so xact.c doesn't issue useless WARNING.
	 */
	CHECK_FOR_INTERRUPTS();

	/*
	 * Race condition -- if the pg_class tuple has gone away since the
	 * last time we saw it, we don't need to vacuum it.
	 */
	if (!SearchSysCacheExists(RELOID,
							  ObjectIdGetDatum(relid),
							  0, 0, 0))
	{
		CommitTransactionCommand(true);
		return true;			/* okay 'cause no data there */
	}

	/*
	 * Determine the type of lock we want --- hard exclusive lock for a
	 * FULL vacuum, but just ShareUpdateExclusiveLock for concurrent
	 * vacuum.	Either way, we can be sure that no other backend is
	 * vacuuming the same table.
	 */
	lmode = vacstmt->full ? AccessExclusiveLock : ShareUpdateExclusiveLock;

	/*
	 * Open the class, get an appropriate lock on it, and check
	 * permissions.
	 *
	 * We allow the user to vacuum a table if he is superuser, the table
	 * owner, or the database owner (but in the latter case, only if it's
	 * not a shared relation).	pg_class_ownercheck includes the superuser
	 * case.
	 *
	 * Note we choose to treat permissions failure as a WARNING and keep
	 * trying to vacuum the rest of the DB --- is this appropriate?
	 */
	onerel = relation_open(relid, lmode);

	if (!(pg_class_ownercheck(RelationGetRelid(onerel), GetUserId()) ||
		  (is_dbadmin(MyDatabaseId) && !onerel->rd_rel->relisshared)))
	{
		elog(WARNING, "Skipping \"%s\" --- only table or database owner can VACUUM it",
			 RelationGetRelationName(onerel));
		relation_close(onerel, lmode);
		CommitTransactionCommand(true);
		return false;
	}

	/*
	 * Check that it's a plain table; we used to do this in getrels() but
	 * seems safer to check after we've locked the relation.
	 */
	if (onerel->rd_rel->relkind != expected_relkind)
	{
		elog(WARNING, "Skipping \"%s\" --- can not process indexes, views or special system tables",
			 RelationGetRelationName(onerel));
		relation_close(onerel, lmode);
		CommitTransactionCommand(true);
		return false;
	}

	/*
	 * Silently ignore tables that are temp tables of other backends ---
	 * trying to vacuum these will lead to great unhappiness, since their
	 * contents are probably not up-to-date on disk.  (We don't throw a
	 * warning here; it would just lead to chatter during a database-wide
	 * VACUUM.)
	 */
	if (isOtherTempNamespace(RelationGetNamespace(onerel)))
	{
		relation_close(onerel, lmode);
		CommitTransactionCommand(true);
		return true;			/* assume no long-lived data in temp tables */
	}

	/*
	 * Get a session-level lock too. This will protect our access to the
	 * relation across multiple transactions, so that we can vacuum the
	 * relation's TOAST table (if any) secure in the knowledge that no one
	 * is deleting the parent relation.
	 *
	 * NOTE: this cannot block, even if someone else is waiting for access,
	 * because the lock manager knows that both lock requests are from the
	 * same process.
	 */
	onerelid = onerel->rd_lockInfo.lockRelId;
	LockRelationForSession(&onerelid, lmode);

	/*
	 * Remember the relation's TOAST relation for later
	 */
	toast_relid = onerel->rd_rel->reltoastrelid;

	/*
	 * Do the actual work --- either FULL or "lazy" vacuum
	 */
	if (vacstmt->full)
		full_vacuum_rel(onerel, vacstmt);
	else
		lazy_vacuum_rel(onerel, vacstmt);

	result = true;				/* did the vacuum */

	/* all done with this class, but hold lock until commit */
	relation_close(onerel, NoLock);

	/*
	 * Complete the transaction and free all temporary memory used.
	 */
	CommitTransactionCommand(true);

	/*
	 * If the relation has a secondary toast rel, vacuum that too while we
	 * still hold the session lock on the master table.  Note however that
	 * "analyze" will not get done on the toast table.	This is good,
	 * because the toaster always uses hardcoded index access and
	 * statistics are totally unimportant for toast relations.
	 */
	if (toast_relid != InvalidOid)
	{
		if (! vacuum_rel(toast_relid, vacstmt, RELKIND_TOASTVALUE))
			result = false;		/* failed to vacuum the TOAST table? */
	}

	/*
	 * Now release the session-level lock on the master table.
	 */
	UnlockRelationForSession(&onerelid, lmode);

	return result;
}


/****************************************************************************
 *																			*
 *			Code for VACUUM FULL (only)										*
 *																			*
 ****************************************************************************
 */


/*
 *	full_vacuum_rel() -- perform FULL VACUUM for one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indexes, and
 *		updates its num_pages and num_tuples statistics.
 *
 *		At entry, we have already established a transaction and opened
 *		and locked the relation.
 */
static void
full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
{
	VacPageListData vacuum_pages;		/* List of pages to vacuum and/or
										 * clean indexes */
	VacPageListData fraged_pages;		/* List of pages with space enough
										 * for re-using */
	Relation   *Irel;
	int			nindexes,
				i;
	VRelStats  *vacrelstats;
	bool		reindex = false;

	if (IsIgnoringSystemIndexes() &&
		IsSystemRelation(onerel))
		reindex = true;

	vacuum_set_xid_limits(vacstmt, onerel->rd_rel->relisshared,
						  &OldestXmin, &FreezeLimit);

	/*
	 * Set up statistics-gathering machinery.
	 */
	vacrelstats = (VRelStats *) palloc(sizeof(VRelStats));
	vacrelstats->rel_pages = 0;
	vacrelstats->rel_tuples = 0;
	vacrelstats->hasindex = false;

	/* scan the heap */
	vacuum_pages.num_pages = fraged_pages.num_pages = 0;
	scan_heap(vacrelstats, onerel, &vacuum_pages, &fraged_pages);

	/* Now open all indexes of the relation */
	vac_open_indexes(onerel, &nindexes, &Irel);
	if (!Irel)
		reindex = false;
	else if (!RelationGetForm(onerel)->relhasindex)
		reindex = true;
	if (nindexes > 0)
		vacrelstats->hasindex = true;

#ifdef NOT_USED

	/*
	 * reindex in VACUUM is dangerous under WAL. ifdef out until it
	 * becomes safe.
	 */
	if (reindex)
	{
		vac_close_indexes(nindexes, Irel);
		Irel = (Relation *) NULL;
		activate_indexes_of_a_table(onerel, false);
	}
#endif   /* NOT_USED */

	/* Clean/scan index relation(s) */
	if (Irel != (Relation *) NULL)
	{
		if (vacuum_pages.num_pages > 0)
		{
			for (i = 0; i < nindexes; i++)
				vacuum_index(&vacuum_pages, Irel[i],
							 vacrelstats->rel_tuples, 0);
		}
		else
		{
			/* just scan indexes to update statistics */
			for (i = 0; i < nindexes; i++)
				scan_index(Irel[i], vacrelstats->rel_tuples);
		}
	}

	if (fraged_pages.num_pages > 0)
	{
		/* Try to shrink heap */
		repair_frag(vacrelstats, onerel, &vacuum_pages, &fraged_pages,
					nindexes, Irel);
		vac_close_indexes(nindexes, Irel);
	}
	else
	{
		vac_close_indexes(nindexes, Irel);
		if (vacuum_pages.num_pages > 0)
		{
			/* Clean pages from vacuum_pages list */
			vacuum_heap(vacrelstats, onerel, &vacuum_pages);
		}
		else
		{
			/*
			 * Flush dirty pages out to disk.  We must do this even if we
			 * didn't do anything else, because we want to ensure that all
			 * tuples have correct on-row commit status on disk (see
			 * bufmgr.c's comments for FlushRelationBuffers()).
			 */
			i = FlushRelationBuffers(onerel, vacrelstats->rel_pages);
			if (i < 0)
				elog(ERROR, "VACUUM (full_vacuum_rel): FlushRelationBuffers returned %d",
					 i);
		}
	}

#ifdef NOT_USED
	if (reindex)
		activate_indexes_of_a_table(onerel, true);
#endif   /* NOT_USED */

	/* update shared free space map with final free space info */
	vac_update_fsm(onerel, &fraged_pages, vacrelstats->rel_pages);

	/* update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(onerel), vacrelstats->rel_pages,
						vacrelstats->rel_tuples, vacrelstats->hasindex);
}


/*
 *	scan_heap() -- scan an open heap relation
 *
 *		This routine sets commit status bits, constructs vacuum_pages (list
 *		of pages we need to compact free space on and/or clean indexes of
 *		deleted tuples), constructs fraged_pages (list of pages with free
 *		space that tuples could be moved into), and calculates statistics
 *		on the number of live tuples in the heap.
 */
static void
scan_heap(VRelStats *vacrelstats, Relation onerel,
		  VacPageList vacuum_pages, VacPageList fraged_pages)
{
	BlockNumber nblocks,
				blkno;
	ItemId		itemid;
	Buffer		buf;
	HeapTupleData tuple;
	OffsetNumber offnum,
				maxoff;
	bool		pgchanged,
				tupgone,
				notup;
	char	   *relname;
	VacPage		vacpage,
				vacpagecopy;
	BlockNumber empty_pages,
				new_pages,
				changed_pages,
				empty_end_pages;
	double		num_tuples,
				tups_vacuumed,
				nkeep,
				nunused;
	double		free_size,
				usable_free_size;
	Size		min_tlen = MaxTupleSize;
	Size		max_tlen = 0;
	int			i;
	bool		do_shrinking = true;
	VTupleLink	vtlinks = (VTupleLink) palloc(100 * sizeof(VTupleLinkData));
	int			num_vtlinks = 0;
	int			free_vtlinks = 100;
	VacRUsage	ru0;

	vac_init_rusage(&ru0);

	relname = RelationGetRelationName(onerel);
	elog(elevel, "--Relation %s.%s--",
		 get_namespace_name(RelationGetNamespace(onerel)),
		 relname);

	empty_pages = new_pages = changed_pages = empty_end_pages = 0;
	num_tuples = tups_vacuumed = nkeep = nunused = 0;
	free_size = 0;

	nblocks = RelationGetNumberOfBlocks(onerel);

	/*
	 * We initially create each VacPage item in a maximal-sized workspace,
	 * then copy the workspace into a just-large-enough copy.
	 */
	vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Page		page,
					tempPage = NULL;
		bool		do_reap,
					do_frag;

		CHECK_FOR_INTERRUPTS();

		buf = ReadBuffer(onerel, blkno);
		page = BufferGetPage(buf);

		vacpage->blkno = blkno;
		vacpage->offsets_used = 0;
		vacpage->offsets_free = 0;

		if (PageIsNew(page))
		{
			elog(WARNING, "Rel %s: Uninitialized page %u - fixing",
				 relname, blkno);
			PageInit(page, BufferGetPageSize(buf), 0);
			vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
			free_size += (vacpage->free - sizeof(ItemIdData));
			new_pages++;
			empty_end_pages++;
			vacpagecopy = copy_vac_page(vacpage);
			vpage_insert(vacuum_pages, vacpagecopy);
			vpage_insert(fraged_pages, vacpagecopy);
			WriteBuffer(buf);
			continue;
		}

		if (PageIsEmpty(page))
		{
			vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
			free_size += (vacpage->free - sizeof(ItemIdData));
			empty_pages++;
			empty_end_pages++;
			vacpagecopy = copy_vac_page(vacpage);
			vpage_insert(vacuum_pages, vacpagecopy);
			vpage_insert(fraged_pages, vacpagecopy);
			ReleaseBuffer(buf);
			continue;
		}

		pgchanged = false;
		notup = true;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			uint16		sv_infomask;

			itemid = PageGetItemId(page, offnum);

			/*
			 * Collect un-used items too - it's possible to have indexes
			 * pointing here after a crash.
			 */
			if (!ItemIdIsUsed(itemid))
			{
				vacpage->offsets[vacpage->offsets_free++] = offnum;
				nunused += 1;
				continue;
			}

			tuple.t_datamcxt = NULL;
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);

			tupgone = false;
			sv_infomask = tuple.t_data->t_infomask;

			switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin))
			{
				case HEAPTUPLE_DEAD:
					tupgone = true;		/* we can delete the tuple */
					break;
				case HEAPTUPLE_LIVE:

					/*
					 * Tuple is good.  Consider whether to replace its
					 * xmin value with FrozenTransactionId.
					 */
					if (TransactionIdIsNormal(HeapTupleHeaderGetXmin(tuple.t_data)) &&
						TransactionIdPrecedes(HeapTupleHeaderGetXmin(tuple.t_data),
											  FreezeLimit))
					{
						HeapTupleHeaderSetXmin(tuple.t_data, FrozenTransactionId);
						/* infomask should be okay already */
						Assert(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED);
						pgchanged = true;
					}
					break;
				case HEAPTUPLE_RECENTLY_DEAD:

					/*
					 * If tuple is recently deleted then we must not
					 * remove it from the relation.
					 */
					nkeep += 1;

					/*
					 * If we do shrinking and this tuple is an updated one
					 * then remember it to construct updated tuple
					 * dependencies.
					 */
					if (do_shrinking &&
						!(ItemPointerEquals(&(tuple.t_self),
											&(tuple.t_data->t_ctid))))
					{
						if (free_vtlinks == 0)
						{
							free_vtlinks = 1000;
							vtlinks = (VTupleLink) repalloc(vtlinks,
										   (free_vtlinks + num_vtlinks) *
												 sizeof(VTupleLinkData));
						}
						vtlinks[num_vtlinks].new_tid = tuple.t_data->t_ctid;
						vtlinks[num_vtlinks].this_tid = tuple.t_self;
						free_vtlinks--;
						num_vtlinks++;
					}
					break;
				case HEAPTUPLE_INSERT_IN_PROGRESS:

					/*
					 * This should not happen, since we hold exclusive
					 * lock on the relation; shouldn't we raise an error?
					 */
					elog(WARNING, "Rel %s: TID %u/%u: InsertTransactionInProgress %u - can't shrink relation",
						 relname, blkno, offnum, HeapTupleHeaderGetXmin(tuple.t_data));
					do_shrinking = false;
					break;
				case HEAPTUPLE_DELETE_IN_PROGRESS:

					/*
					 * This should not happen, since we hold exclusive
					 * lock on the relation; shouldn't we raise an error?
					 */
					elog(WARNING, "Rel %s: TID %u/%u: DeleteTransactionInProgress %u - can't shrink relation",
						 relname, blkno, offnum, HeapTupleHeaderGetXmax(tuple.t_data));
					do_shrinking = false;
					break;
				default:
					elog(ERROR, "Unexpected HeapTupleSatisfiesVacuum result");
					break;
			}

			/* check for hint-bit update by HeapTupleSatisfiesVacuum */
			if (sv_infomask != tuple.t_data->t_infomask)
				pgchanged = true;

			/*
			 * Other checks...
			 */
			if (onerel->rd_rel->relhasoids &&
				!OidIsValid(HeapTupleGetOid(&tuple)))
				elog(WARNING, "Rel %s: TID %u/%u: OID IS INVALID. TUPGONE %d.",
					 relname, blkno, offnum, (int) tupgone);

			if (tupgone)
			{
				ItemId		lpp;

				/*
				 * Here we are building a temporary copy of the page with
				 * dead tuples removed.  Below we will apply
				 * PageRepairFragmentation to the copy, so that we can
				 * determine how much space will be available after
				 * removal of dead tuples.	But note we are NOT changing
				 * the real page yet...
				 */
				if (tempPage == (Page) NULL)
				{
					Size		pageSize;

					pageSize = PageGetPageSize(page);
					tempPage = (Page) palloc(pageSize);
					memcpy(tempPage, page, pageSize);
				}

				/* mark it unused on the temp page */
				lpp = PageGetItemId(tempPage, offnum);
				lpp->lp_flags &= ~LP_USED;

				vacpage->offsets[vacpage->offsets_free++] = offnum;
				tups_vacuumed += 1;
			}
			else
			{
				num_tuples += 1;
				notup = false;
				if (tuple.t_len < min_tlen)
					min_tlen = tuple.t_len;
				if (tuple.t_len > max_tlen)
					max_tlen = tuple.t_len;
			}
		}						/* scan along page */

		if (tempPage != (Page) NULL)
		{
			/* Some tuples are removable; figure free space after removal */
			PageRepairFragmentation(tempPage, NULL);
			vacpage->free = ((PageHeader) tempPage)->pd_upper - ((PageHeader) tempPage)->pd_lower;
			pfree(tempPage);
			do_reap = true;
		}
		else
		{
			/* Just use current available space */
			vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
			/* Need to reap the page if it has ~LP_USED line pointers */
			do_reap = (vacpage->offsets_free > 0);
		}

		free_size += vacpage->free;

		/*
		 * Add the page to fraged_pages if it has a useful amount of free
		 * space.  "Useful" means enough for a minimal-sized tuple. But we
		 * don't know that accurately near the start of the relation, so
		 * add pages unconditionally if they have >= BLCKSZ/10 free space.
		 */
		do_frag = (vacpage->free >= min_tlen || vacpage->free >= BLCKSZ / 10);

		if (do_reap || do_frag)
		{
			vacpagecopy = copy_vac_page(vacpage);
			if (do_reap)
				vpage_insert(vacuum_pages, vacpagecopy);
			if (do_frag)
				vpage_insert(fraged_pages, vacpagecopy);
		}

		if (notup)
			empty_end_pages++;
		else
			empty_end_pages = 0;

		if (pgchanged)
		{
			WriteBuffer(buf);
			changed_pages++;
		}
		else
			ReleaseBuffer(buf);
	}

	pfree(vacpage);

	/* save stats in the rel list for use later */
	vacrelstats->rel_tuples = num_tuples;
	vacrelstats->rel_pages = nblocks;
	if (num_tuples == 0)
		min_tlen = max_tlen = 0;
	vacrelstats->min_tlen = min_tlen;
	vacrelstats->max_tlen = max_tlen;

	vacuum_pages->empty_end_pages = empty_end_pages;
	fraged_pages->empty_end_pages = empty_end_pages;

	/*
	 * Clear the fraged_pages list if we found we couldn't shrink. Else,
	 * remove any "empty" end-pages from the list, and compute usable free
	 * space = free space in remaining pages.
	 */
	if (do_shrinking)
	{
		Assert((BlockNumber) fraged_pages->num_pages >= empty_end_pages);
		fraged_pages->num_pages -= empty_end_pages;
		usable_free_size = 0;
		for (i = 0; i < fraged_pages->num_pages; i++)
			usable_free_size += fraged_pages->pagedesc[i]->free;
	}
	else
	{
		fraged_pages->num_pages = 0;
		usable_free_size = 0;
	}

	/* don't bother to save vtlinks if we will not call repair_frag */
	if (fraged_pages->num_pages > 0 && num_vtlinks > 0)
	{
		qsort((char *) vtlinks, num_vtlinks, sizeof(VTupleLinkData),
			  vac_cmp_vtlinks);
		vacrelstats->vtlinks = vtlinks;
		vacrelstats->num_vtlinks = num_vtlinks;
	}
	else
	{
		vacrelstats->vtlinks = NULL;
		vacrelstats->num_vtlinks = 0;
		pfree(vtlinks);
	}

	elog(elevel, "Pages %u: Changed %u, reaped %u, Empty %u, New %u; "
		 "Tup %.0f: Vac %.0f, Keep/VTL %.0f/%u, UnUsed %.0f, MinLen %lu, "
		 "MaxLen %lu; Re-using: Free/Avail. Space %.0f/%.0f; "
		 "EndEmpty/Avail. Pages %u/%u.\n\t%s",
		 nblocks, changed_pages, vacuum_pages->num_pages, empty_pages,
		 new_pages, num_tuples, tups_vacuumed,
		 nkeep, vacrelstats->num_vtlinks,
		 nunused, (unsigned long) min_tlen, (unsigned long) max_tlen,
		 free_size, usable_free_size,
		 empty_end_pages, fraged_pages->num_pages,
		 vac_show_rusage(&ru0));
}


/*
B
Bruce Momjian 已提交
1377
 *	repair_frag() -- try to repair relation's fragmentation
1378
 *
1379
 *		This routine marks dead tuples as unused and tries re-use dead space
1380 1381
 *		by moving tuples (and inserting indexes if needed). It constructs
 *		Nvacpagelist list of free-ed pages (moved tuples) and clean indexes
1382 1383 1384
 *		for them after committing (in hack-manner - without losing locks
 *		and freeing memory!) current transaction. It truncates relation
 *		if some end-blocks are gone away.
1385 1386
 */
static void
B
Bruce Momjian 已提交
1387
repair_frag(VRelStats *vacrelstats, Relation onerel,
B
Bruce Momjian 已提交
1388
			VacPageList vacuum_pages, VacPageList fraged_pages,
1389
			int nindexes, Relation *Irel)
1390
{
1391 1392 1393
	TransactionId myXID;
	CommandId	myCID;
	Buffer		buf,
B
Bruce Momjian 已提交
1394
				cur_buffer;
1395
	BlockNumber nblocks,
1396
				blkno;
1397
	BlockNumber last_move_dest_block = 0,
1398
				last_vacuum_block;
1399 1400
	Page		page,
				ToPage = NULL;
1401 1402
	OffsetNumber offnum,
				maxoff,
1403
				newoff,
B
Bruce Momjian 已提交
1404
				max_offset;
1405 1406
	ItemId		itemid,
				newitemid;
B
Bruce Momjian 已提交
1407 1408
	HeapTupleData tuple,
				newtup;
1409
	TupleDesc	tupdesc;
1410 1411 1412 1413
	ResultRelInfo *resultRelInfo;
	EState	   *estate;
	TupleTable	tupleTable;
	TupleTableSlot *slot;
B
Bruce Momjian 已提交
1414 1415
	VacPageListData Nvacpagelist;
	VacPage		cur_page = NULL,
B
Bruce Momjian 已提交
1416
				last_vacuum_page,
B
Bruce Momjian 已提交
1417 1418
				vacpage,
			   *curpage;
B
Bruce Momjian 已提交
1419
	int			cur_item = 0;
1420
	int			i;
B
Bruce Momjian 已提交
1421 1422 1423 1424
	Size		tuple_len;
	int			num_moved,
				num_fraged_pages,
				vacuumed_pages;
B
Bruce Momjian 已提交
1425
	int			checked_moved,
1426 1427
				num_tuples,
				keep_tuples = 0;
1428
	bool		isempty,
1429 1430
				dowrite,
				chain_tuple_moved;
1431
	VacRUsage	ru0;
1432

1433
	vac_init_rusage(&ru0);
1434 1435 1436 1437

	myXID = GetCurrentTransactionId();
	myCID = GetCurrentCommandId();

1438 1439
	tupdesc = RelationGetDescr(onerel);

1440 1441 1442 1443 1444 1445 1446
	/*
	 * We need a ResultRelInfo and an EState so we can use the regular
	 * executor's index-entry-making machinery.
	 */
	resultRelInfo = makeNode(ResultRelInfo);
	resultRelInfo->ri_RangeTableIndex = 1;		/* dummy */
	resultRelInfo->ri_RelationDesc = onerel;
1447
	resultRelInfo->ri_TrigDesc = NULL;	/* we don't fire triggers */
1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459

	ExecOpenIndices(resultRelInfo);

	estate = CreateExecutorState();
	estate->es_result_relations = resultRelInfo;
	estate->es_num_result_relations = 1;
	estate->es_result_relation_info = resultRelInfo;

	/* Set up a dummy tuple table too */
	tupleTable = ExecCreateTupleTable(1);
	slot = ExecAllocTableSlot(tupleTable);
	ExecSetSlotDescriptor(slot, tupdesc, false);
1460

B
Bruce Momjian 已提交
1461 1462
	Nvacpagelist.num_pages = 0;
	num_fraged_pages = fraged_pages->num_pages;
1463
	Assert((BlockNumber) vacuum_pages->num_pages >= vacuum_pages->empty_end_pages);
B
Bruce Momjian 已提交
1464
	vacuumed_pages = vacuum_pages->num_pages - vacuum_pages->empty_end_pages;
1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475
	if (vacuumed_pages > 0)
	{
		/* get last reaped page from vacuum_pages */
		last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
		last_vacuum_block = last_vacuum_page->blkno;
	}
	else
	{
		last_vacuum_page = NULL;
		last_vacuum_block = InvalidBlockNumber;
	}
B
Bruce Momjian 已提交
1476 1477
	cur_buffer = InvalidBuffer;
	num_moved = 0;
1478

B
Bruce Momjian 已提交
1479 1480
	vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
	vacpage->offsets_used = vacpage->offsets_free = 0;
1481

1482 1483
	/*
	 * Scan pages backwards from the last nonempty page, trying to move
B
Bruce Momjian 已提交
1484
	 * tuples down to lower pages.	Quit when we reach a page that we have
1485 1486 1487
	 * moved any tuples onto, or the first page if we haven't moved
	 * anything, or when we find a page we cannot completely empty (this
	 * last condition is handled by "break" statements within the loop).
1488
	 *
B
Bruce Momjian 已提交
1489
	 * NB: this code depends on the vacuum_pages and fraged_pages lists being
1490
	 * in order by blkno.
1491
	 */
1492
	nblocks = vacrelstats->rel_pages;
B
Bruce Momjian 已提交
1493
	for (blkno = nblocks - vacuum_pages->empty_end_pages - 1;
1494 1495
		 blkno > last_move_dest_block;
		 blkno--)
V
Vadim B. Mikheev 已提交
1496
	{
1497 1498
		CHECK_FOR_INTERRUPTS();

1499
		/*
1500 1501 1502 1503 1504
		 * Forget fraged_pages pages at or after this one; they're no
		 * longer useful as move targets, since we only want to move down.
		 * Note that since we stop the outer loop at last_move_dest_block,
		 * pages removed here cannot have had anything moved onto them
		 * already.
		 *
		 * Also note that we don't change the stored fraged_pages list, only
		 * our local variable num_fraged_pages; so the forgotten pages are
		 * still available to be loaded into the free space map later.
		 */
		while (num_fraged_pages > 0 &&
			fraged_pages->pagedesc[num_fraged_pages - 1]->blkno >= blkno)
		{
			Assert(fraged_pages->pagedesc[num_fraged_pages - 1]->offsets_used == 0);
			--num_fraged_pages;
		}

		/*
		 * Process this page of relation.
		 */
		buf = ReadBuffer(onerel, blkno);
		page = BufferGetPage(buf);

		vacpage->offsets_free = 0;

		isempty = PageIsEmpty(page);

		dowrite = false;

		/* Is the page in the vacuum_pages list? */
		if (blkno == last_vacuum_block)
		{
			if (last_vacuum_page->offsets_free > 0)
			{
				/* there are dead tuples on this page - clean them */
				Assert(!isempty);
				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
				vacuum_page(onerel, buf, last_vacuum_page);
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
				dowrite = true;
			}
			else
				Assert(isempty);
			--vacuumed_pages;
			if (vacuumed_pages > 0)
			{
				/* get prev reaped page from vacuum_pages */
				last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
				last_vacuum_block = last_vacuum_page->blkno;
			}
			else
			{
				last_vacuum_page = NULL;
				last_vacuum_block = InvalidBlockNumber;
			}
			if (isempty)
			{
				ReleaseBuffer(buf);
				continue;
			}
		}
		else
			Assert(!isempty);

		chain_tuple_moved = false;		/* no chain tuple was moved off
										 * this page yet */
		vacpage->blkno = blkno;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			itemid = PageGetItemId(page, offnum);

			if (!ItemIdIsUsed(itemid))
				continue;

			tuple.t_datamcxt = NULL;
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple_len = tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);

			if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
			{
				if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
					elog(ERROR, "HEAP_MOVED_IN was not expected");

				/*
				 * If this (chain) tuple was already moved by me, I have to
				 * check whether it is in vacpage or not - i.e. whether it
				 * was moved while cleaning this page or some previous one.
				 */
				if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
				{
					if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
						elog(ERROR, "Invalid XVAC in tuple header");
					if (keep_tuples == 0)
						continue;
					if (chain_tuple_moved)
					{
						/* some chains were moved while cleaning this page */
						Assert(vacpage->offsets_free > 0);
						for (i = 0; i < vacpage->offsets_free; i++)
						{
							if (vacpage->offsets[i] == offnum)
								break;
						}
						if (i >= vacpage->offsets_free) /* not found */
						{
							vacpage->offsets[vacpage->offsets_free++] = offnum;
							keep_tuples--;
						}
					}
					else
					{
						vacpage->offsets[vacpage->offsets_free++] = offnum;
						keep_tuples--;
					}
					continue;
				}
				elog(ERROR, "HEAP_MOVED_OFF was expected");
			}

			/*
			 * If this tuple is in a chain of tuples created in updates by
			 * "recent" transactions, then we have to move the whole chain
			 * of tuples to other places.
			 *
			 * NOTE: this test is not 100% accurate: it is possible for a
			 * tuple to be an updated one with recent xmin, and yet not
			 * have a corresponding tuple in the vtlinks list.  Presumably
			 * there was once a parent tuple with xmax matching the xmin,
			 * but it's possible that that tuple has been removed --- for
			 * example, if it had xmin = xmax then
			 * HeapTupleSatisfiesVacuum would deem it removable as soon as
			 * the xmin xact completes.
			 *
			 * To be on the safe side, we abandon the repair_frag process if
			 * we cannot find the parent tuple in vtlinks.  This may be
			 * overly conservative; AFAICS it would be safe to move the
			 * chain.
			 */
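			/*
			 * For illustration: an update chain is a sequence of tuple
			 * versions linked by t_ctid, e.g.
			 *
			 *	(blk 5, off 3) --t_ctid--> (blk 9, off 1) --t_ctid--> itself
			 *
			 * where the last version points at itself.  The test below
			 * spots a chain member either by a "recent" xmin on an updated
			 * tuple, or by a t_ctid that differs from the tuple's own TID.
			 */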
			if (((tuple.t_data->t_infomask & HEAP_UPDATED) &&
			 !TransactionIdPrecedes(HeapTupleHeaderGetXmin(tuple.t_data),
									OldestXmin)) ||
				(!(tuple.t_data->t_infomask & (HEAP_XMAX_INVALID |
											   HEAP_MARKED_FOR_UPDATE)) &&
				 !(ItemPointerEquals(&(tuple.t_self),
									 &(tuple.t_data->t_ctid)))))
			{
				Buffer		Cbuf = buf;
				bool		freeCbuf = false;
				bool		chain_move_failed = false;
				Page		Cpage;
				ItemId		Citemid;
				ItemPointerData Ctid;
				HeapTupleData tp = tuple;
				Size		tlen = tuple_len;
				VTupleMove	vtmove;
				int			num_vtmove;
				int			free_vtmove;
				VacPage		to_vacpage = NULL;
				int			to_item = 0;
				int			ti;

				if (cur_buffer != InvalidBuffer)
				{
					WriteBuffer(cur_buffer);
					cur_buffer = InvalidBuffer;
				}

				/* Quick exit if we have no vtlinks to search in */
				if (vacrelstats->vtlinks == NULL)
				{
					elog(WARNING, "Parent item in update-chain not found - can't continue repair_frag");
					break;		/* out of walk-along-page loop */
				}

				vtmove = (VTupleMove) palloc(100 * sizeof(VTupleMoveData));
				num_vtmove = 0;
				free_vtmove = 100;

				/*
				 * If this tuple is at the beginning or middle of the
				 * chain then we have to move to the end of the chain.
				 */
				while (!(tp.t_data->t_infomask & (HEAP_XMAX_INVALID |
											  HEAP_MARKED_FOR_UPDATE)) &&
					   !(ItemPointerEquals(&(tp.t_self),
										   &(tp.t_data->t_ctid))))
				{
					Ctid = tp.t_data->t_ctid;
					if (freeCbuf)
						ReleaseBuffer(Cbuf);
					freeCbuf = true;
					Cbuf = ReadBuffer(onerel,
									  ItemPointerGetBlockNumber(&Ctid));
					Cpage = BufferGetPage(Cbuf);
					Citemid = PageGetItemId(Cpage,
									  ItemPointerGetOffsetNumber(&Ctid));
					if (!ItemIdIsUsed(Citemid))
					{
						/*
						 * This means that in the middle of the chain there
						 * was a tuple updated by an older (than OldestXmin)
						 * xaction, and that tuple has already been deleted
						 * by me.  Actually, the upper part of the chain
						 * should be removed, and it seems that this should
						 * be handled in scan_heap(), but it's not
						 * implemented at the moment, so we just stop
						 * shrinking here.
						 */
						elog(WARNING, "Child itemid in update-chain marked as unused - can't continue repair_frag");
						chain_move_failed = true;
						break;	/* out of loop to move to chain end */
					}
					tp.t_datamcxt = NULL;
					tp.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
					tp.t_self = Ctid;
					tlen = tp.t_len = ItemIdGetLength(Citemid);
				}
				if (chain_move_failed)
				{
					if (freeCbuf)
						ReleaseBuffer(Cbuf);
					pfree(vtmove);
					break;		/* out of walk-along-page loop */
				}

				/*
				 * Check if all items in chain can be moved
				 */
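				/*
				 * This pass only plans the moves: starting from the chain
				 * end we walk back through vtlinks, reserving space on a
				 * fraged page for each chain member and recording the
				 * intended move in vtmove[].  Nothing is modified yet, so
				 * if some member cannot be placed we can still give up
				 * cleanly below.
				 */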
				for (;;)
				{
					Buffer		Pbuf;
					Page		Ppage;
					ItemId		Pitemid;
					HeapTupleData Ptp;
					VTupleLinkData vtld,
							   *vtlp;

					if (to_vacpage == NULL ||
						!enough_space(to_vacpage, tlen))
					{
						for (i = 0; i < num_fraged_pages; i++)
						{
							if (enough_space(fraged_pages->pagedesc[i], tlen))
								break;
						}

						if (i == num_fraged_pages)
						{
							/* can't move item anywhere */
							chain_move_failed = true;
							break;		/* out of check-all-items loop */
						}
						to_item = i;
						to_vacpage = fraged_pages->pagedesc[to_item];
					}
					to_vacpage->free -= MAXALIGN(tlen);
					if (to_vacpage->offsets_used >= to_vacpage->offsets_free)
						to_vacpage->free -= MAXALIGN(sizeof(ItemIdData));
					(to_vacpage->offsets_used)++;
					if (free_vtmove == 0)
					{
						free_vtmove = 1000;
						vtmove = (VTupleMove)
							repalloc(vtmove,
									 (free_vtmove + num_vtmove) *
									 sizeof(VTupleMoveData));
					}
					vtmove[num_vtmove].tid = tp.t_self;
					vtmove[num_vtmove].vacpage = to_vacpage;
					if (to_vacpage->offsets_used == 1)
						vtmove[num_vtmove].cleanVpd = true;
					else
						vtmove[num_vtmove].cleanVpd = false;
					free_vtmove--;
					num_vtmove++;

					/* At beginning of chain? */
					if (!(tp.t_data->t_infomask & HEAP_UPDATED) ||
						TransactionIdPrecedes(HeapTupleHeaderGetXmin(tp.t_data),
											  OldestXmin))
						break;

					/* No, move to tuple with prior row version */
					vtld.new_tid = tp.t_self;
					vtlp = (VTupleLink)
						vac_bsearch((void *) &vtld,
									(void *) (vacrelstats->vtlinks),
									vacrelstats->num_vtlinks,
									sizeof(VTupleLinkData),
									vac_cmp_vtlinks);
					if (vtlp == NULL)
					{
						/* see discussion above */
						elog(WARNING, "Parent item in update-chain not found - can't continue repair_frag");
						chain_move_failed = true;
						break;	/* out of check-all-items loop */
					}
					tp.t_self = vtlp->this_tid;
					Pbuf = ReadBuffer(onerel,
								ItemPointerGetBlockNumber(&(tp.t_self)));
					Ppage = BufferGetPage(Pbuf);
					Pitemid = PageGetItemId(Ppage,
							   ItemPointerGetOffsetNumber(&(tp.t_self)));
					/* this can't happen since we saw tuple earlier: */
					if (!ItemIdIsUsed(Pitemid))
						elog(ERROR, "Parent itemid marked as unused");
					Ptp.t_datamcxt = NULL;
					Ptp.t_data = (HeapTupleHeader) PageGetItem(Ppage, Pitemid);

					/* ctid should not have changed since we saved it */
					Assert(ItemPointerEquals(&(vtld.new_tid),
											 &(Ptp.t_data->t_ctid)));

					/*
					 * Read above about the cases when
					 * !ItemIdIsUsed(Citemid) (the child item is
					 * removed)...  Since at the moment we don't remove the
					 * useless part of an update-chain, it's possible to
					 * get a too-old parent row here.  As in the case which
					 * caused this problem, we stop shrinking here.  I
					 * could try to find the real parent row, but don't
					 * want to, because the real solution will be
					 * implemented later anyway, and we are too close to
					 * the 6.5 release. - vadim 06/11/99
					 */
					if (!(TransactionIdEquals(HeapTupleHeaderGetXmax(Ptp.t_data),
									 HeapTupleHeaderGetXmin(tp.t_data))))
					{
						ReleaseBuffer(Pbuf);
						elog(WARNING, "Too old parent tuple found - can't continue repair_frag");
						chain_move_failed = true;
						break;	/* out of check-all-items loop */
					}
					tp.t_datamcxt = Ptp.t_datamcxt;
					tp.t_data = Ptp.t_data;
					tlen = tp.t_len = ItemIdGetLength(Pitemid);
					if (freeCbuf)
						ReleaseBuffer(Cbuf);
					Cbuf = Pbuf;
					freeCbuf = true;
				}				/* end of check-all-items loop */

				if (freeCbuf)
					ReleaseBuffer(Cbuf);
				freeCbuf = false;

				if (chain_move_failed)
				{
					/*
					 * Undo changes to offsets_used state.  We don't
					 * bother cleaning up the amount-free state, since
					 * we're not going to do any further tuple motion.
					 */
					for (i = 0; i < num_vtmove; i++)
					{
						Assert(vtmove[i].vacpage->offsets_used > 0);
						(vtmove[i].vacpage->offsets_used)--;
					}
					pfree(vtmove);
					break;		/* out of walk-along-page loop */
				}

				/*
				 * Okay, move the whole tuple chain
				 */
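				/*
				 * vtmove[0] is the newest chain member (the chain end) and
				 * higher indexes hold successively older versions.  Moving
				 * newest-first lets us set each older tuple's new t_ctid to
				 * the already-known new location of its successor, which is
				 * tracked in Ctid below.
				 */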
				ItemPointerSetInvalid(&Ctid);
				for (ti = 0; ti < num_vtmove; ti++)
				{
					VacPage		destvacpage = vtmove[ti].vacpage;

					/* Get page to move from */
					tuple.t_self = vtmove[ti].tid;
					Cbuf = ReadBuffer(onerel,
							 ItemPointerGetBlockNumber(&(tuple.t_self)));

					/* Get page to move to */
					cur_buffer = ReadBuffer(onerel, destvacpage->blkno);

					LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
					if (cur_buffer != Cbuf)
						LockBuffer(Cbuf, BUFFER_LOCK_EXCLUSIVE);

					ToPage = BufferGetPage(cur_buffer);
					Cpage = BufferGetPage(Cbuf);

					Citemid = PageGetItemId(Cpage,
							ItemPointerGetOffsetNumber(&(tuple.t_self)));
					tuple.t_datamcxt = NULL;
					tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
					tuple_len = tuple.t_len = ItemIdGetLength(Citemid);

					/*
					 * make a copy of the source tuple, and then mark the
					 * source tuple MOVED_OFF.
					 */
					heap_copytuple_with_tuple(&tuple, &newtup);

					/*
					 * register invalidation of source tuple in catcaches.
					 */
					CacheInvalidateHeapTuple(onerel, &tuple);

					/* NO ELOG(ERROR) TILL CHANGES ARE LOGGED */
					START_CRIT_SECTION();

					tuple.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
												  HEAP_XMIN_INVALID |
												  HEAP_MOVED_IN);
					tuple.t_data->t_infomask |= HEAP_MOVED_OFF;
					HeapTupleHeaderSetXvac(tuple.t_data, myXID);

					/*
					 * If this page was not used before - clean it.
					 *
					 * NOTE: a nasty bug used to lurk here.  It is possible
					 * for the source and destination pages to be the same
					 * (since this tuple-chain member can be on a page
					 * lower than the one we're currently processing in
					 * the outer loop).  If that's true, then after
					 * vacuum_page() the source tuple will have been
					 * moved, and tuple.t_data will be pointing at
					 * garbage.  Therefore we must do everything that uses
					 * tuple.t_data BEFORE this step!!
					 *
					 * This path is different from the other callers of
					 * vacuum_page, because we have already incremented
					 * the vacpage's offsets_used field to account for the
					 * tuple(s) we expect to move onto the page. Therefore
					 * vacuum_page's check for offsets_used == 0 is wrong.
					 * But since that's a good debugging check for all
					 * other callers, we work around it here rather than
					 * remove it.
					 */
					if (!PageIsEmpty(ToPage) && vtmove[ti].cleanVpd)
					{
						int			sv_offsets_used = destvacpage->offsets_used;

						destvacpage->offsets_used = 0;
						vacuum_page(onerel, cur_buffer, destvacpage);
						destvacpage->offsets_used = sv_offsets_used;
					}

					/*
					 * Update the state of the copied tuple, and store it
					 * on the destination page.
					 */
					newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
												   HEAP_XMIN_INVALID |
												   HEAP_MOVED_OFF);
					newtup.t_data->t_infomask |= HEAP_MOVED_IN;
					HeapTupleHeaderSetXvac(newtup.t_data, myXID);
					newoff = PageAddItem(ToPage,
										 (Item) newtup.t_data,
										 tuple_len,
										 InvalidOffsetNumber,
										 LP_USED);
					if (newoff == InvalidOffsetNumber)
					{
						elog(PANIC, "moving chain: failed to add item with len = %lu to page %u",
						  (unsigned long) tuple_len, destvacpage->blkno);
					}
					newitemid = PageGetItemId(ToPage, newoff);
					pfree(newtup.t_data);
					newtup.t_datamcxt = NULL;
					newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
					ItemPointerSet(&(newtup.t_self), destvacpage->blkno, newoff);

					/* XLOG stuff */
					if (!onerel->rd_istemp)
					{
						XLogRecPtr	recptr =
						log_heap_move(onerel, Cbuf, tuple.t_self,
									  cur_buffer, &newtup);

						if (Cbuf != cur_buffer)
						{
							PageSetLSN(Cpage, recptr);
							PageSetSUI(Cpage, ThisStartUpID);
						}
						PageSetLSN(ToPage, recptr);
						PageSetSUI(ToPage, ThisStartUpID);
					}
					else
					{
						/*
						 * No XLOG record, but still need to flag that XID
						 * exists on disk
						 */
						MyXactMadeTempRelUpdate = true;
					}

					END_CRIT_SECTION();

					if (destvacpage->blkno > last_move_dest_block)
						last_move_dest_block = destvacpage->blkno;

					/*
					 * Set new tuple's t_ctid pointing to itself for last
					 * tuple in chain, and to next tuple in chain
					 * otherwise.
					 */
					if (!ItemPointerIsValid(&Ctid))
						newtup.t_data->t_ctid = newtup.t_self;
					else
						newtup.t_data->t_ctid = Ctid;
					Ctid = newtup.t_self;

					num_moved++;

					/*
					 * Remember that we moved tuple from the current page
					 * (corresponding index tuple will be cleaned).
					 */
					if (Cbuf == buf)
						vacpage->offsets[vacpage->offsets_free++] =
							ItemPointerGetOffsetNumber(&(tuple.t_self));
					else
						keep_tuples++;

					LockBuffer(cur_buffer, BUFFER_LOCK_UNLOCK);
					if (cur_buffer != Cbuf)
						LockBuffer(Cbuf, BUFFER_LOCK_UNLOCK);

					/* Create index entries for the moved tuple */
					if (resultRelInfo->ri_NumIndices > 0)
					{
						ExecStoreTuple(&newtup, slot, InvalidBuffer, false);
						ExecInsertIndexTuples(slot, &(newtup.t_self),
											  estate, true);
					}

					WriteBuffer(cur_buffer);
					WriteBuffer(Cbuf);
				}				/* end of move-the-tuple-chain loop */

				cur_buffer = InvalidBuffer;
				pfree(vtmove);
				chain_tuple_moved = true;

				/* advance to next tuple in walk-along-page loop */
				continue;
			}					/* end of is-tuple-in-chain test */

			/* try to find new page for this tuple */
			if (cur_buffer == InvalidBuffer ||
				!enough_space(cur_page, tuple_len))
			{
				if (cur_buffer != InvalidBuffer)
				{
					WriteBuffer(cur_buffer);
					cur_buffer = InvalidBuffer;
				}
				for (i = 0; i < num_fraged_pages; i++)
				{
					if (enough_space(fraged_pages->pagedesc[i], tuple_len))
						break;
				}
				if (i == num_fraged_pages)
					break;		/* can't move item anywhere */
				cur_item = i;
				cur_page = fraged_pages->pagedesc[cur_item];
				cur_buffer = ReadBuffer(onerel, cur_page->blkno);
				LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
				ToPage = BufferGetPage(cur_buffer);
				/* if this page was not used before - clean it */
				if (!PageIsEmpty(ToPage) && cur_page->offsets_used == 0)
					vacuum_page(onerel, cur_buffer, cur_page);
			}
			else
				LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);

			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

			/* copy tuple */
			heap_copytuple_with_tuple(&tuple, &newtup);

			/*
			 * register invalidation of source tuple in catcaches.
			 *
			 * (Note: we do not need to register the copied tuple, because we
			 * are not changing the tuple contents and so there cannot be
			 * any need to flush negative catcache entries.)
			 */
			CacheInvalidateHeapTuple(onerel, &tuple);

			/* NO ELOG(ERROR) TILL CHANGES ARE LOGGED */
			START_CRIT_SECTION();

			/*
			 * Mark new tuple as MOVED_IN by me.
			 */
			newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
										   HEAP_XMIN_INVALID |
										   HEAP_MOVED_OFF);
			newtup.t_data->t_infomask |= HEAP_MOVED_IN;
			HeapTupleHeaderSetXvac(newtup.t_data, myXID);

			/* add tuple to the page */
			newoff = PageAddItem(ToPage, (Item) newtup.t_data, tuple_len,
								 InvalidOffsetNumber, LP_USED);
			if (newoff == InvalidOffsetNumber)
			{
				elog(PANIC, "failed to add item with len = %lu to page %u (free space %lu, nused %u, noff %u)",
					 (unsigned long) tuple_len,
					 cur_page->blkno, (unsigned long) cur_page->free,
					 cur_page->offsets_used, cur_page->offsets_free);
			}
			newitemid = PageGetItemId(ToPage, newoff);
			pfree(newtup.t_data);
			newtup.t_datamcxt = NULL;
			newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
			ItemPointerSet(&(newtup.t_data->t_ctid), cur_page->blkno, newoff);
			newtup.t_self = newtup.t_data->t_ctid;

			/*
			 * Mark old tuple as MOVED_OFF by me.
			 */
			tuple.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
										  HEAP_XMIN_INVALID |
										  HEAP_MOVED_IN);
			tuple.t_data->t_infomask |= HEAP_MOVED_OFF;
			HeapTupleHeaderSetXvac(tuple.t_data, myXID);

			/* XLOG stuff */
			if (!onerel->rd_istemp)
			{
				XLogRecPtr	recptr =
				log_heap_move(onerel, buf, tuple.t_self,
							  cur_buffer, &newtup);

				PageSetLSN(page, recptr);
				PageSetSUI(page, ThisStartUpID);
				PageSetLSN(ToPage, recptr);
				PageSetSUI(ToPage, ThisStartUpID);
			}
			else
			{
				/*
				 * No XLOG record, but still need to flag that XID exists
				 * on disk
				 */
				MyXactMadeTempRelUpdate = true;
			}

			END_CRIT_SECTION();

			cur_page->offsets_used++;
			num_moved++;
			cur_page->free = ((PageHeader) ToPage)->pd_upper - ((PageHeader) ToPage)->pd_lower;
			if (cur_page->blkno > last_move_dest_block)
				last_move_dest_block = cur_page->blkno;

			vacpage->offsets[vacpage->offsets_free++] = offnum;

			LockBuffer(cur_buffer, BUFFER_LOCK_UNLOCK);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);

			/* insert index tuples if needed */
			if (resultRelInfo->ri_NumIndices > 0)
			{
				ExecStoreTuple(&newtup, slot, InvalidBuffer, false);
				ExecInsertIndexTuples(slot, &(newtup.t_self), estate, true);
			}
		}						/* walk along page */

		/*
		 * If we broke out of the walk-along-page loop early (ie, still
		 * have offnum <= maxoff), then we failed to move some tuple off
		 * this page.  No point in shrinking any more, so clean up and
		 * exit the per-page loop.
		 */
		if (offnum < maxoff && keep_tuples > 0)
		{
			OffsetNumber off;

			/*
			 * Fix vacpage state for any unvisited tuples remaining on
			 * page
			 */
			for (off = OffsetNumberNext(offnum);
				 off <= maxoff;
				 off = OffsetNumberNext(off))
			{
				itemid = PageGetItemId(page, off);
				if (!ItemIdIsUsed(itemid))
					continue;
				tuple.t_datamcxt = NULL;
				tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
				if (tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)
					continue;
				if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
					elog(ERROR, "HEAP_MOVED_IN was not expected (2)");
				if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
				{
					if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
						elog(ERROR, "Invalid XVAC in tuple header (4)");
					if (chain_tuple_moved)
					{
						/* some chains were moved while cleaning this page */
						Assert(vacpage->offsets_free > 0);
						for (i = 0; i < vacpage->offsets_free; i++)
						{
							if (vacpage->offsets[i] == off)
								break;
						}
						if (i >= vacpage->offsets_free) /* not found */
						{
							vacpage->offsets[vacpage->offsets_free++] = off;
							Assert(keep_tuples > 0);
							keep_tuples--;
						}
					}
					else
					{
						vacpage->offsets[vacpage->offsets_free++] = off;
						Assert(keep_tuples > 0);
						keep_tuples--;
					}
				}
				else
					elog(ERROR, "HEAP_MOVED_OFF was expected (2)");
			}
		}

		if (vacpage->offsets_free > 0)	/* some tuples were moved */
		{
			if (chain_tuple_moved)		/* else - they are ordered */
			{
				qsort((char *) (vacpage->offsets), vacpage->offsets_free,
					  sizeof(OffsetNumber), vac_cmp_offno);
			}
			vpage_insert(&Nvacpagelist, copy_vac_page(vacpage));
			WriteBuffer(buf);
		}
		else if (dowrite)
			WriteBuffer(buf);
		else
			ReleaseBuffer(buf);

		if (offnum <= maxoff)
			break;				/* had to quit early, see above note */

	}							/* walk along relation */

	blkno++;					/* new number of blocks */

	if (cur_buffer != InvalidBuffer)
	{
		Assert(num_moved > 0);
		WriteBuffer(cur_buffer);
	}

	if (num_moved > 0)
	{
		/*
		 * We have to commit our tuple movings before we truncate the
		 * relation.  Ideally we should do Commit/StartTransactionCommand
		 * here, relying on the session-level table lock to protect our
		 * exclusive access to the relation.  However, that would require
		 * a lot of extra code to close and re-open the relation, indexes,
		 * etc.  For now, a quick hack: record status of current
		 * transaction as committed, and continue.
		 */
		RecordTransactionCommit();
	}

	/*
	 * We are not going to move any more tuples across pages, but we still
	 * need to apply vacuum_page to compact free space in the remaining
	 * pages in vacuum_pages list.  Note that some of these pages may also
	 * be in the fraged_pages list, and may have had tuples moved onto
	 * them; if so, we already did vacuum_page and needn't do it again.
	 */
	for (i = 0, curpage = vacuum_pages->pagedesc;
		 i < vacuumed_pages;
		 i++, curpage++)
	{
		CHECK_FOR_INTERRUPTS();
		Assert((*curpage)->blkno < blkno);
		if ((*curpage)->offsets_used == 0)
		{
			/* this page was not used as a move target, so must clean it */
			buf = ReadBuffer(onerel, (*curpage)->blkno);
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
			page = BufferGetPage(buf);
			if (!PageIsEmpty(page))
				vacuum_page(onerel, buf, *curpage);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			WriteBuffer(buf);
		}
	}

	/*
	 * Now scan all the pages that we moved tuples onto and update tuple
	 * status bits.  This is not really necessary, but will save time for
	 * future transactions examining these tuples.
	 *
	 * XXX WARNING that this code fails to clear HEAP_MOVED_OFF tuples from
	 * pages that were move source pages but not move dest pages.  One
	 * also wonders whether it wouldn't be better to skip this step and
	 * let the tuple status updates happen someplace that's not holding an
	 * exclusive lock on the relation.
	 */
	checked_moved = 0;
	for (i = 0, curpage = fraged_pages->pagedesc;
		 i < num_fraged_pages;
		 i++, curpage++)
	{
		CHECK_FOR_INTERRUPTS();
		Assert((*curpage)->blkno < blkno);
		if ((*curpage)->blkno > last_move_dest_block)
			break;				/* no need to scan any further */
		if ((*curpage)->offsets_used == 0)
			continue;			/* this page was never used as a move dest */
		buf = ReadBuffer(onerel, (*curpage)->blkno);
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		page = BufferGetPage(buf);
		num_tuples = 0;
		max_offset = PageGetMaxOffsetNumber(page);
		for (newoff = FirstOffsetNumber;
			 newoff <= max_offset;
			 newoff = OffsetNumberNext(newoff))
		{
			itemid = PageGetItemId(page, newoff);
			if (!ItemIdIsUsed(itemid))
				continue;
			tuple.t_datamcxt = NULL;
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
			{
				if (!(tuple.t_data->t_infomask & HEAP_MOVED))
					elog(ERROR, "HEAP_MOVED_OFF/HEAP_MOVED_IN was expected");
				if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
					elog(ERROR, "Invalid XVAC in tuple header (2)");
				if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
				{
					tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
					tuple.t_data->t_infomask &= ~HEAP_MOVED;
					num_tuples++;
				}
				else
					tuple.t_data->t_infomask |= HEAP_XMIN_INVALID;
			}
		}
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		WriteBuffer(buf);
		Assert((*curpage)->offsets_used == num_tuples);
		checked_moved += num_tuples;
	}
	Assert(num_moved == checked_moved);

	elog(elevel, "Rel %s: Pages: %u --> %u; Tuple(s) moved: %u.\n\t%s",
		 RelationGetRelationName(onerel),
		 nblocks, blkno, num_moved,
		 vac_show_rusage(&ru0));

	/*
	 * Reflect the motion of system tuples to catalog cache here.
	 */
	CommandCounterIncrement();

	if (Nvacpagelist.num_pages > 0)
	{
		/* vacuum indexes again if needed */
		if (Irel != (Relation *) NULL)
		{
			VacPage    *vpleft,
					   *vpright,
						vpsave;

			/* re-sort Nvacpagelist.pagedesc */
			for (vpleft = Nvacpagelist.pagedesc,
			vpright = Nvacpagelist.pagedesc + Nvacpagelist.num_pages - 1;
				 vpleft < vpright; vpleft++, vpright--)
			{
				vpsave = *vpleft;
				*vpleft = *vpright;
				*vpright = vpsave;
			}
			Assert(keep_tuples >= 0);
			for (i = 0; i < nindexes; i++)
				vacuum_index(&Nvacpagelist, Irel[i],
							 vacrelstats->rel_tuples, keep_tuples);
		}

		/* clean moved tuples from last page in Nvacpagelist list */
		if (vacpage->blkno == (blkno - 1) &&
			vacpage->offsets_free > 0)
		{
			OffsetNumber unbuf[BLCKSZ / sizeof(OffsetNumber)];
			OffsetNumber *unused = unbuf;
			int			uncnt;

			buf = ReadBuffer(onerel, vacpage->blkno);
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
			page = BufferGetPage(buf);
			num_tuples = 0;
			maxoff = PageGetMaxOffsetNumber(page);
			for (offnum = FirstOffsetNumber;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				itemid = PageGetItemId(page, offnum);
				if (!ItemIdIsUsed(itemid))
					continue;
				tuple.t_datamcxt = NULL;
				tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);

				if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
				{
					if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
					{
						if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
							elog(ERROR, "Invalid XVAC in tuple header (3)");
						itemid->lp_flags &= ~LP_USED;
						num_tuples++;
					}
					else
						elog(ERROR, "HEAP_MOVED_OFF was expected (3)");
				}

			}
			Assert(vacpage->offsets_free == num_tuples);

			START_CRIT_SECTION();

			uncnt = PageRepairFragmentation(page, unused);

			/* XLOG stuff */
			if (!onerel->rd_istemp)
			{
				XLogRecPtr	recptr;

				recptr = log_heap_clean(onerel, buf, (char *) unused,
						  (char *) (&(unused[uncnt])) - (char *) unused);
				PageSetLSN(page, recptr);
				PageSetSUI(page, ThisStartUpID);
			}
			else
			{
				/*
				 * No XLOG record, but still need to flag that XID exists
				 * on disk
				 */
				MyXactMadeTempRelUpdate = true;
			}

			END_CRIT_SECTION();

			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			WriteBuffer(buf);
		}

		/* now - free new list of reaped pages */
		curpage = Nvacpagelist.pagedesc;
		for (i = 0; i < Nvacpagelist.num_pages; i++, curpage++)
			pfree(*curpage);
		pfree(Nvacpagelist.pagedesc);
	}

	/*
	 * Flush dirty pages out to disk.  We do this unconditionally, even if
	 * we don't need to truncate, because we want to ensure that all
	 * tuples have correct on-row commit status on disk (see bufmgr.c's
	 * comments for FlushRelationBuffers()).
	 */
	i = FlushRelationBuffers(onerel, blkno);
	if (i < 0)
		elog(ERROR, "VACUUM (repair_frag): FlushRelationBuffers returned %d",
			 i);

	/* truncate relation, if needed */
	if (blkno < nblocks)
	{
		blkno = smgrtruncate(DEFAULT_SMGR, onerel, blkno);
		onerel->rd_nblocks = blkno;		/* update relcache immediately */
		onerel->rd_targblock = InvalidBlockNumber;
		vacrelstats->rel_pages = blkno; /* set new number of blocks */
	}

	/* clean up */
	pfree(vacpage);
	if (vacrelstats->vtlinks != NULL)
		pfree(vacrelstats->vtlinks);

	ExecDropTupleTable(tupleTable, true);

	ExecCloseIndices(resultRelInfo);
}

/*
 *	vacuum_heap() -- free dead tuples
 *
 *		This routine marks dead tuples as unused and truncates relation
 *		if there are "empty" end-blocks.
 */
static void
vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
{
	Buffer		buf;
	VacPage    *vacpage;
	BlockNumber relblocks;
	int			nblocks;
	int			i;

	nblocks = vacuum_pages->num_pages;
	nblocks -= vacuum_pages->empty_end_pages;	/* nothing to do with them */

	for (i = 0, vacpage = vacuum_pages->pagedesc; i < nblocks; i++, vacpage++)
	{
		CHECK_FOR_INTERRUPTS();
		if ((*vacpage)->offsets_free > 0)
		{
			buf = ReadBuffer(onerel, (*vacpage)->blkno);
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
			vacuum_page(onerel, buf, *vacpage);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			WriteBuffer(buf);
		}
	}

	/*
	 * Flush dirty pages out to disk.  We do this unconditionally, even if
	 * we don't need to truncate, because we want to ensure that all
	 * tuples have correct on-row commit status on disk (see bufmgr.c's
	 * comments for FlushRelationBuffers()).
	 */
	Assert(vacrelstats->rel_pages >= vacuum_pages->empty_end_pages);
	relblocks = vacrelstats->rel_pages - vacuum_pages->empty_end_pages;

	i = FlushRelationBuffers(onerel, relblocks);
	if (i < 0)
		elog(ERROR, "VACUUM (vacuum_heap): FlushRelationBuffers returned %d",
			 i);

	/* truncate relation if there are some empty end-pages */
	if (vacuum_pages->empty_end_pages > 0)
	{
		elog(elevel, "Rel %s: Pages: %u --> %u.",
			 RelationGetRelationName(onerel),
			 vacrelstats->rel_pages, relblocks);
		relblocks = smgrtruncate(DEFAULT_SMGR, onerel, relblocks);
		onerel->rd_nblocks = relblocks; /* update relcache immediately */
		onerel->rd_targblock = InvalidBlockNumber;
		vacrelstats->rel_pages = relblocks;		/* set new number of blocks */
	}
}

/*
 *	vacuum_page() -- free dead tuples on a page
 *					 and repair its fragmentation.
 */
static void
vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
{
	OffsetNumber unbuf[BLCKSZ / sizeof(OffsetNumber)];
	OffsetNumber *unused = unbuf;
	int			uncnt;
	Page		page = BufferGetPage(buffer);
	ItemId		itemid;
	int			i;

	/* There shouldn't be any tuples moved onto the page yet! */
	Assert(vacpage->offsets_used == 0);

	START_CRIT_SECTION();

	for (i = 0; i < vacpage->offsets_free; i++)
	{
		itemid = PageGetItemId(page, vacpage->offsets[i]);
		itemid->lp_flags &= ~LP_USED;
	}

	uncnt = PageRepairFragmentation(page, unused);
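
	/*
	 * PageRepairFragmentation compacts the live tuples and reports, in
	 * unused[], the offset numbers of the uncnt line pointers that are no
	 * longer used; that array is exactly what gets logged below so that
	 * replay can redo the same cleanup.
	 */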

	/* XLOG stuff */
	if (!onerel->rd_istemp)
	{
		XLogRecPtr	recptr;

		recptr = log_heap_clean(onerel, buffer, (char *) unused,
						  (char *) (&(unused[uncnt])) - (char *) unused);
		PageSetLSN(page, recptr);
		PageSetSUI(page, ThisStartUpID);
	}
	else
	{
		/* No XLOG record, but still need to flag that XID exists on disk */
		MyXactMadeTempRelUpdate = true;
	}

	END_CRIT_SECTION();
}

/*
 *	scan_index() -- scan one index relation to update statistics.
 *
 * We use this when we have no deletions to do.
 */
static void
scan_index(Relation indrel, double num_tuples)
{
	IndexBulkDeleteResult *stats;
	VacRUsage	ru0;

	vac_init_rusage(&ru0);

	/*
	 * Even though we're not planning to delete anything, use the
	 * ambulkdelete call, so that the scan happens within the index AM for
	 * more speed.
	 */
	stats = index_bulk_delete(indrel, dummy_tid_reaped, NULL);

	if (!stats)
		return;

	/* now update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(indrel),
						stats->num_pages, stats->num_index_tuples,
						false);

	elog(elevel, "Index %s: Pages %u; Tuples %.0f.\n\t%s",
		 RelationGetRelationName(indrel),
		 stats->num_pages, stats->num_index_tuples,
		 vac_show_rusage(&ru0));

	/*
	 * Check for tuple count mismatch.  If the index is partial, then it's
	 * OK for it to have fewer tuples than the heap; else we got trouble.
	 */
	if (stats->num_index_tuples != num_tuples)
	{
		if (stats->num_index_tuples > num_tuples ||
			!vac_is_partial_index(indrel))
			elog(WARNING, "Index %s: NUMBER OF INDEX' TUPLES (%.0f) IS NOT THE SAME AS HEAP' (%.0f)."
				 "\n\tRecreate the index.",
				 RelationGetRelationName(indrel),
				 stats->num_index_tuples, num_tuples);
	}

	pfree(stats);
}

/*
 *	vacuum_index() -- vacuum one index relation.
 *
 *		vacpagelist is the VacPageList of the heap we're currently
 *		vacuuming.  It's locked.  Indrel is an index relation on the
 *		vacuumed heap.
 *
 *		We don't bother to set locks on the index relation here, since
 *		the parent table is exclusive-locked already.
 *
 *		Finally, we arrange to update the index relation's statistics in
 *		pg_class.
 */
static void
vacuum_index(VacPageList vacpagelist, Relation indrel,
			 double num_tuples, int keep_tuples)
{
	IndexBulkDeleteResult *stats;
	VacRUsage	ru0;

	vac_init_rusage(&ru0);

	/* Do bulk deletion */
	stats = index_bulk_delete(indrel, tid_reaped, (void *) vacpagelist);

	if (!stats)
		return;

	/* now update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(indrel),
						stats->num_pages, stats->num_index_tuples,
						false);

	elog(elevel, "Index %s: Pages %u; Tuples %.0f: Deleted %.0f.\n\t%s",
		 RelationGetRelationName(indrel), stats->num_pages,
		 stats->num_index_tuples - keep_tuples, stats->tuples_removed,
		 vac_show_rusage(&ru0));

	/*
	 * Check for tuple count mismatch.  If the index is partial, then it's
	 * OK for it to have fewer tuples than the heap; else we got trouble.
	 */
	if (stats->num_index_tuples != num_tuples + keep_tuples)
	{
		if (stats->num_index_tuples > num_tuples + keep_tuples ||
			!vac_is_partial_index(indrel))
			elog(WARNING, "Index %s: NUMBER OF INDEX' TUPLES (%.0f) IS NOT THE SAME AS HEAP' (%.0f)."
				 "\n\tRecreate the index.",
				 RelationGetRelationName(indrel),
				 stats->num_index_tuples, num_tuples);
	}

	pfree(stats);
}

/*
 *	tid_reaped() -- is a particular tid reaped?
 *
 *		This has the right signature to be an IndexBulkDeleteCallback.
 *
 *		vacpagelist->VacPage_array is sorted in the right order.
 */
static bool
tid_reaped(ItemPointer itemptr, void *state)
{
	VacPageList vacpagelist = (VacPageList) state;
	OffsetNumber ioffno;
	OffsetNumber *voff;
	VacPage		vp,
			   *vpp;
	VacPageData vacpage;

	vacpage.blkno = ItemPointerGetBlockNumber(itemptr);
	ioffno = ItemPointerGetOffsetNumber(itemptr);
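
	/*
	 * Two-level lookup: first binary-search the page list for the TID's
	 * block, then binary-search that page's offset array.  A page entry
	 * with offsets_free == 0 denotes a wholly-empty page, so every TID
	 * on it counts as reaped.
	 */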
	vp = &vacpage;
	vpp = (VacPage *) vac_bsearch((void *) &vp,
								  (void *) (vacpagelist->pagedesc),
								  vacpagelist->num_pages,
								  sizeof(VacPage),
								  vac_cmp_blk);

	if (vpp == NULL)
		return false;

	/* ok - we are on a partially or fully reaped page */
	vp = *vpp;

	if (vp->offsets_free == 0)
	{
		/* this is EmptyPage, so claim all tuples on it are reaped!!! */
		return true;
	}

	voff = (OffsetNumber *) vac_bsearch((void *) &ioffno,
										(void *) (vp->offsets),
										vp->offsets_free,
										sizeof(OffsetNumber),
										vac_cmp_offno);

	if (voff == NULL)
		return false;

	/* tid is reaped */
	return true;
}

/*
 * Dummy version for scan_index.
 */
static bool
dummy_tid_reaped(ItemPointer itemptr, void *state)
{
	return false;
}

/*
 * Update the shared Free Space Map with the info we now have about
 * free space in the relation, discarding any old info the map may have.
 */
static void
vac_update_fsm(Relation onerel, VacPageList fraged_pages,
			   BlockNumber rel_pages)
{
	int			nPages = fraged_pages->num_pages;
	int			i;
	PageFreeSpaceInfo *pageSpaces;

	/* +1 to avoid palloc(0) */
	pageSpaces = (PageFreeSpaceInfo *)
		palloc((nPages + 1) * sizeof(PageFreeSpaceInfo));

	for (i = 0; i < nPages; i++)
	{
		pageSpaces[i].blkno = fraged_pages->pagedesc[i]->blkno;
		pageSpaces[i].avail = fraged_pages->pagedesc[i]->free;

		/*
		 * fraged_pages may contain entries for pages that we later
		 * decided to truncate from the relation; don't enter them into
		 * the free space map!
		 */
		if (pageSpaces[i].blkno >= rel_pages)
		{
			nPages = i;
			break;
		}
	}

	MultiRecordFreeSpace(&onerel->rd_node, 0, nPages, pageSpaces);

	pfree(pageSpaces);
}

/* Copy a VacPage structure */
static VacPage
copy_vac_page(VacPage vacpage)
{
	VacPage		newvacpage;

	/* allocate a VacPageData entry */
	newvacpage = (VacPage) palloc(sizeof(VacPageData) +
						   vacpage->offsets_free * sizeof(OffsetNumber));
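
	/*
	 * VacPageData ends with a one-element offsets[] array (the classic C
	 * struct hack), so the palloc above reserves room for all offsets_free
	 * entries in a single contiguous allocation.
	 */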

	/* fill it in */
	if (vacpage->offsets_free > 0)
		memcpy(newvacpage->offsets, vacpage->offsets,
			   vacpage->offsets_free * sizeof(OffsetNumber));
	newvacpage->blkno = vacpage->blkno;
	newvacpage->free = vacpage->free;
	newvacpage->offsets_used = vacpage->offsets_used;
	newvacpage->offsets_free = vacpage->offsets_free;

	return newvacpage;
}

/*
 * Add a VacPage pointer to a VacPageList.
 *
 *		As a side effect of the way that scan_heap works,
 *		higher pages come after lower pages in the array
 *		(and highest tid on a page is last).
 */
static void
vpage_insert(VacPageList vacpagelist, VacPage vpnew)
{
#define PG_NPAGEDESC 1024

	/* allocate a VacPage entry if needed */
	if (vacpagelist->num_pages == 0)
	{
		vacpagelist->pagedesc = (VacPage *) palloc(PG_NPAGEDESC * sizeof(VacPage));
		vacpagelist->num_allocated_pages = PG_NPAGEDESC;
	}
	else if (vacpagelist->num_pages >= vacpagelist->num_allocated_pages)
	{
		vacpagelist->num_allocated_pages *= 2;
		vacpagelist->pagedesc = (VacPage *) repalloc(vacpagelist->pagedesc, vacpagelist->num_allocated_pages * sizeof(VacPage));
	}
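
	/*
	 * The doubling growth policy (1024, 2048, 4096, ... slots) keeps
	 * appends amortized O(1) no matter how many pages the relation has.
	 */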
	vacpagelist->pagedesc[vacpagelist->num_pages] = vpnew;
	(vacpagelist->num_pages)++;
}

/*
 * vac_bsearch: just like standard C library routine bsearch(),
 * except that we first test to see whether the target key is outside
 * the range of the table entries.  This case is handled relatively slowly
 * by the normal binary search algorithm (ie, no faster than any other key)
 * but it occurs often enough in VACUUM to be worth optimizing.
 */
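/*
 * For example, when tid_reaped probes an index TID on block 900 while the
 * highest reaped page is 750, the single comparison against the last array
 * entry rejects it without entering bsearch() at all.
 */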
static void *
vac_bsearch(const void *key, const void *base,
			size_t nelem, size_t size,
			int (*compar) (const void *, const void *))
{
	int			res;
	const void *last;

	if (nelem == 0)
		return NULL;
	res = compar(key, base);
	if (res < 0)
		return NULL;
	if (res == 0)
		return (void *) base;
	if (nelem > 1)
	{
		last = (const void *) ((const char *) base + (nelem - 1) * size);
		res = compar(key, last);
		if (res > 0)
			return NULL;
		if (res == 0)
			return (void *) last;
	}
	if (nelem <= 2)
		return NULL;			/* already checked 'em all */
	return bsearch(key, base, nelem, size, compar);
}

/*
 * Comparator routines for use with qsort() and bsearch().
 */
static int
vac_cmp_blk(const void *left, const void *right)
{
	BlockNumber lblk,
				rblk;

	lblk = (*((VacPage *) left))->blkno;
	rblk = (*((VacPage *) right))->blkno;

	if (lblk < rblk)
		return -1;
	if (lblk == rblk)
		return 0;
	return 1;
}

static int
vac_cmp_offno(const void *left, const void *right)
{
	if (*(OffsetNumber *) left < *(OffsetNumber *) right)
		return -1;
	if (*(OffsetNumber *) left == *(OffsetNumber *) right)
		return 0;
	return 1;
}

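/*
 * Order VTupleLinks by new_tid: first the block number (hi and lo halves),
 * then the position within the block.  The vtlinks array is kept in this
 * order so that the vac_bsearch calls in repair_frag can find a parent
 * tuple by its child's TID.
 */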
static int
vac_cmp_vtlinks(const void *left, const void *right)
{
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi <
		((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
		return -1;
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi >
		((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
		return 1;
	/* bi_hi-es are equal */
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo <
		((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
		return -1;
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo >
		((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
		return 1;
	/* bi_lo-es are equal */
	if (((VTupleLink) left)->new_tid.ip_posid <
		((VTupleLink) right)->new_tid.ip_posid)
		return -1;
	if (((VTupleLink) left)->new_tid.ip_posid >
		((VTupleLink) right)->new_tid.ip_posid)
		return 1;
	return 0;
}


void
vac_open_indexes(Relation relation, int *nindexes, Relation **Irel)
{
	List	   *indexoidlist,
			   *indexoidscan;
	int			i;

	indexoidlist = RelationGetIndexList(relation);

	*nindexes = length(indexoidlist);

	if (*nindexes > 0)
		*Irel = (Relation *) palloc(*nindexes * sizeof(Relation));
	else
		*Irel = NULL;

	i = 0;
	foreach(indexoidscan, indexoidlist)
	{
		Oid			indexoid = lfirsti(indexoidscan);

		(*Irel)[i] = index_open(indexoid);
		i++;
	}

	freeList(indexoidlist);
}


void
vac_close_indexes(int nindexes, Relation *Irel)
{
	if (Irel == (Relation *) NULL)
		return;

	while (nindexes--)
		index_close(Irel[nindexes]);
	pfree(Irel);
}


/*
 * Is an index partial (ie, could it contain fewer tuples than the heap?)
 */
bool
vac_is_partial_index(Relation indrel)
{
	/*
	 * If the index's AM doesn't support nulls, it's partial for our
	 * purposes
	 */
	if (!indrel->rd_am->amindexnulls)
		return true;

	/* Otherwise, look to see if there's a partial-index predicate */
	return (VARSIZE(&indrel->rd_index->indpred) > VARHDRSZ);
}


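/*
 * Does vacpage have room for a tuple of size len?  We must charge for a
 * new line pointer unless a free one can be recycled.  For example, with
 * free == 100 bytes, a MAXALIGN'd len of 100 fits while a free line
 * pointer is available (offsets_used < offsets_free); once a new one must
 * be allocated, sizeof(ItemIdData) more bytes are needed and it no longer
 * fits.
 */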
static bool
enough_space(VacPage vacpage, Size len)
{
	len = MAXALIGN(len);

	if (len > vacpage->free)
		return false;

	/* if there are free itemid(s) and len <= free_space... */
	if (vacpage->offsets_used < vacpage->offsets_free)
		return true;

	/* noff_used >= noff_free and so we'll have to allocate new itemid */
	if (len + sizeof(ItemIdData) <= vacpage->free)
		return true;

	return false;
}


/*
 * Initialize usage snapshot.
 */
void
vac_init_rusage(VacRUsage *ru0)
{
	struct timezone tz;

	getrusage(RUSAGE_SELF, &ru0->ru);
	gettimeofday(&ru0->tv, &tz);
}

/*
 * Compute elapsed time since ru0 usage snapshot, and format into
 * a displayable string.  Result is in a static string, which is
 * tacky, but no one ever claimed that the Postgres backend is
 * threadable...
 */
const char *
vac_show_rusage(VacRUsage *ru0)
{
	static char result[100];
	VacRUsage	ru1;

	vac_init_rusage(&ru1);

	if (ru1.tv.tv_usec < ru0->tv.tv_usec)
	{
		ru1.tv.tv_sec--;
		ru1.tv.tv_usec += 1000000;
	}
	if (ru1.ru.ru_stime.tv_usec < ru0->ru.ru_stime.tv_usec)
	{
		ru1.ru.ru_stime.tv_sec--;
		ru1.ru.ru_stime.tv_usec += 1000000;
	}
	if (ru1.ru.ru_utime.tv_usec < ru0->ru.ru_utime.tv_usec)
	{
		ru1.ru.ru_utime.tv_sec--;
		ru1.ru.ru_utime.tv_usec += 1000000;
	}

	snprintf(result, sizeof(result),
			 "CPU %d.%02ds/%d.%02du sec elapsed %d.%02d sec.",
			 (int) (ru1.ru.ru_stime.tv_sec - ru0->ru.ru_stime.tv_sec),
	  (int) (ru1.ru.ru_stime.tv_usec - ru0->ru.ru_stime.tv_usec) / 10000,
			 (int) (ru1.ru.ru_utime.tv_sec - ru0->ru.ru_utime.tv_sec),
	  (int) (ru1.ru.ru_utime.tv_usec - ru0->ru.ru_utime.tv_usec) / 10000,
			 (int) (ru1.tv.tv_sec - ru0->tv.tv_sec),
			 (int) (ru1.tv.tv_usec - ru0->tv.tv_usec) / 10000);

	return result;
}