/*-------------------------------------------------------------------------
 *
 * vacuumlazy.c
 *	  Concurrent ("lazy") vacuuming.
 *
 *
 * The major space usage for LAZY VACUUM is storage for the array of dead
 * tuple TIDs, with the next biggest need being storage for per-disk-page
 * free space info.  We want to ensure we can vacuum even the very largest
 * relations with finite memory space usage.  To do that, we set upper bounds
 * on the number of tuples and pages we will keep track of at once.
 *
 * We are willing to use at most maintenance_work_mem memory space to keep
 * track of dead tuples.  We initially allocate an array of TIDs of that size,
 * with an upper limit that depends on table size (this limit ensures we don't
 * allocate a huge area uselessly for vacuuming small tables).	If the array
 * threatens to overflow, we suspend the heap scan phase and perform a pass of
 * index cleanup and page compaction, then resume the heap scan with an empty
 * TID array.
 *
 * If we're processing a table with no indexes, we can just vacuum each page
 * as we go; there's no need to save up multiple tuples to minimize the number
 * of index scans performed.  So we don't use maintenance_work_mem memory for
 * the TID array, just enough to hold as many heap tuples as fit on one page.
 *
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.109 2008/10/31 15:05:00 heikki Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <math.h>

#include "access/genam.h"
#include "access/heapam.h"
#include "access/transam.h"
#include "commands/dbcommands.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/pg_rusage.h"
#include "utils/tqual.h"


/*
 * Space/time tradeoff parameters: do these need to be user-tunable?
 *
 * To consider truncating the relation, we want there to be at least
 * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
 * is less) potentially-freeable pages.
 */
#define REL_TRUNCATE_MINIMUM	1000
#define REL_TRUNCATE_FRACTION	16
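/*
 * Illustrative arithmetic (an editorial example, not part of the original
 * source): with these defaults, truncation is attempted once at least
 * Min(REL_TRUNCATE_MINIMUM, rel_pages / REL_TRUNCATE_FRACTION) pages at the
 * end of the relation look freeable, e.g. 500 pages for an 8000-page table,
 * but never more than 1000 pages no matter how large the table is.
 */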

/*
 * Guesstimation of number of dead tuples per page.  This is used to
 * provide an upper limit to memory allocated when vacuuming small
 * tables.
 */
#define LAZY_ALLOC_TUPLES		MaxHeapTuplesPerPage

typedef struct LVRelStats
{
	/* hasindex = true means two-pass strategy; false means one-pass */
	bool		hasindex;
	/* Overall statistics about rel */
	BlockNumber rel_pages;
	double		rel_tuples;
	BlockNumber pages_removed;
	double		tuples_deleted;
	BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
	/* List of TIDs of tuples we intend to delete */
	/* NB: this list is ordered by TID address */
	int			num_dead_tuples;	/* current # of entries */
	int			max_dead_tuples;	/* # slots allocated in array */
	ItemPointer dead_tuples;	/* array of ItemPointerData */
	int			num_index_scans;
} LVRelStats;


/* A few variables that don't seem worth passing around as parameters */
static int	elevel = -1;

static TransactionId OldestXmin;
static TransactionId FreezeLimit;

static BufferAccessStrategy vac_strategy;


/* non-export function prototypes */
static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
			   Relation *Irel, int nindexes);
static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
static void lazy_vacuum_index(Relation indrel,
				  IndexBulkDeleteResult **stats,
				  LVRelStats *vacrelstats);
static void lazy_cleanup_index(Relation indrel,
				   IndexBulkDeleteResult *stats,
				   LVRelStats *vacrelstats);
static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
				 int tupindex, LVRelStats *vacrelstats);
static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
static BlockNumber count_nondeletable_pages(Relation onerel,
						 LVRelStats *vacrelstats);
static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks);
static void lazy_record_dead_tuple(LVRelStats *vacrelstats,
					   ItemPointer itemptr);
static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
static int	vac_cmp_itemptr(const void *left, const void *right);


/*
 *	lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indexes, and
 *		updates its relpages and reltuples statistics.
 *
 *		At entry, we have already established a transaction and opened
 *		and locked the relation.
 */
void
lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
				BufferAccessStrategy bstrategy)
{
	LVRelStats *vacrelstats;
	Relation   *Irel;
	int			nindexes;
	BlockNumber possibly_freeable;
	PGRUsage	ru0;
	TimestampTz starttime = 0;

	pg_rusage_init(&ru0);

	/* measure elapsed time iff autovacuum logging requires it */
	if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration > 0)
		starttime = GetCurrentTimestamp();

	if (vacstmt->verbose)
		elevel = INFO;
	else
		elevel = DEBUG2;

	vac_strategy = bstrategy;

	vacuum_set_xid_limits(vacstmt->freeze_min_age, onerel->rd_rel->relisshared,
						  &OldestXmin, &FreezeLimit);

	vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));

	vacrelstats->num_index_scans = 0;

	/* Open all indexes of the relation */
	vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
	vacrelstats->hasindex = (nindexes > 0);

	/* Do the vacuuming */
	lazy_scan_heap(onerel, vacrelstats, Irel, nindexes);

	/* Done with indexes */
	vac_close_indexes(nindexes, Irel, NoLock);

	/*
	 * Optionally truncate the relation.
	 *
	 * Don't even think about it unless we have a shot at releasing a goodly
	 * number of pages.  Otherwise, the time taken isn't worth it.
	 */
	possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
	if (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
		possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION)
		lazy_truncate_heap(onerel, vacrelstats);

	/* Vacuum the Free Space Map */
	FreeSpaceMapVacuum(onerel);

	/* Update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(onerel),
						vacrelstats->rel_pages,
						vacrelstats->rel_tuples,
						vacrelstats->hasindex,
						FreezeLimit);

	/* report results to the stats collector, too */
	pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared,
						 vacstmt->analyze, vacrelstats->rel_tuples);

	/* and log the action if appropriate */
	if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
	{
		if (Log_autovacuum_min_duration == 0 ||
			TimestampDifferenceExceeds(starttime, GetCurrentTimestamp(),
									   Log_autovacuum_min_duration))
			ereport(LOG,
					(errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"
							"pages: %d removed, %d remain\n"
							"tuples: %.0f removed, %.0f remain\n"
							"system usage: %s",
							get_database_name(MyDatabaseId),
							get_namespace_name(RelationGetNamespace(onerel)),
							RelationGetRelationName(onerel),
							vacrelstats->num_index_scans,
						  vacrelstats->pages_removed, vacrelstats->rel_pages,
						vacrelstats->tuples_deleted, vacrelstats->rel_tuples,
							pg_rusage_show(&ru0))));
	}
}


/*
 *	lazy_scan_heap() -- scan an open heap relation
 *
 *		This routine sets commit status bits, builds lists of dead tuples
 *		and pages with free space, and calculates statistics on the number
 *		of live tuples in the heap.  When done, or when we run low on space
 *		for dead-tuple TIDs, invoke vacuuming of indexes and heap.
 *
 *		If there are no indexes then we just vacuum each dirty page as we
 *		process it, since there's no point in gathering many tuples.
 */
static void
lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
			   Relation *Irel, int nindexes)
{
	BlockNumber nblocks,
				blkno;
	HeapTupleData tuple;
	char	   *relname;
	BlockNumber empty_pages,
				vacuumed_pages;
	double		num_tuples,
				tups_vacuumed,
				nkeep,
				nunused;
	IndexBulkDeleteResult **indstats;
	int			i;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	relname = RelationGetRelationName(onerel);
	ereport(elevel,
			(errmsg("vacuuming \"%s.%s\"",
					get_namespace_name(RelationGetNamespace(onerel)),
					relname)));

	empty_pages = vacuumed_pages = 0;
	num_tuples = tups_vacuumed = nkeep = nunused = 0;

	indstats = (IndexBulkDeleteResult **)
		palloc0(nindexes * sizeof(IndexBulkDeleteResult *));

	nblocks = RelationGetNumberOfBlocks(onerel);
	vacrelstats->rel_pages = nblocks;
	vacrelstats->nonempty_pages = 0;

	lazy_space_alloc(vacrelstats, nblocks);

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		bool		tupgone,
					hastup;
		int			prev_dead_count;
		OffsetNumber frozen[MaxOffsetNumber];
		int			nfrozen;
		Size		freespace;

		vacuum_delay_point();

		/*
		 * If we are close to overrunning the available space for dead-tuple
		 * TIDs, pause and do a cycle of vacuuming before we tackle this page.
		 */
		if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
			vacrelstats->num_dead_tuples > 0)
		{
			/* Remove index entries */
			for (i = 0; i < nindexes; i++)
				lazy_vacuum_index(Irel[i],
								  &indstats[i],
								  vacrelstats);
			/* Remove tuples from heap */
			lazy_vacuum_heap(onerel, vacrelstats);
			/* Forget the now-vacuumed tuples, and press on */
			vacrelstats->num_dead_tuples = 0;
			vacrelstats->num_index_scans++;
		}

		buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
								 RBM_NORMAL, vac_strategy);

		/* We need buffer cleanup lock so that we can prune HOT chains. */
		LockBufferForCleanup(buf);

		page = BufferGetPage(buf);

		if (PageIsNew(page))
		{
			/*
			 * An all-zeroes page could be left over if a backend extends the
			 * relation but crashes before initializing the page. Reclaim such
			 * pages for use.
			 *
			 * We have to be careful here because we could be looking at a
			 * page that someone has just added to the relation and not yet
			 * been able to initialize (see RelationGetBufferForTuple). To
			 * protect against that, release the buffer lock, grab the
			 * relation extension lock momentarily, and re-lock the buffer. If
			 * the page is still uninitialized by then, it must be left over
			 * from a crashed backend, and we can initialize it.
			 *
			 * We don't really need the relation lock when this is a new or
			 * temp relation, but it's probably not worth the code space to
			 * check that, since this surely isn't a critical path.
			 *
			 * Note: the comparable code in vacuum.c need not worry because
			 * it's got exclusive lock on the whole relation.
			 */
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			LockRelationForExtension(onerel, ExclusiveLock);
			UnlockRelationForExtension(onerel, ExclusiveLock);
			LockBufferForCleanup(buf);
			if (PageIsNew(page))
			{
				ereport(WARNING,
				(errmsg("relation \"%s\" page %u is uninitialized --- fixing",
						relname, blkno)));
				PageInit(page, BufferGetPageSize(buf), 0);
				empty_pages++;
			}
			freespace = PageGetHeapFreeSpace(page);
			MarkBufferDirty(buf);
			UnlockReleaseBuffer(buf);

			RecordPageWithFreeSpace(onerel, blkno, freespace);
			continue;
		}

		if (PageIsEmpty(page))
		{
			empty_pages++;
			freespace = PageGetHeapFreeSpace(page);
			UnlockReleaseBuffer(buf);
			RecordPageWithFreeSpace(onerel, blkno, freespace);
			continue;
		}

		/*
		 * Prune all HOT-update chains in this page.
		 *
		 * We count tuples removed by the pruning step as removed by VACUUM.
		 */
		tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin,
										 false, false);

		/*
		 * Now scan the page to collect vacuumable items and check for tuples
		 * requiring freezing.
		 */
		nfrozen = 0;
		hastup = false;
		prev_dead_count = vacrelstats->num_dead_tuples;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid;

			itemid = PageGetItemId(page, offnum);

			/* Unused items require no processing, but we count 'em */
			if (!ItemIdIsUsed(itemid))
			{
				nunused += 1;
				continue;
			}

			/* Redirect items mustn't be touched */
			if (ItemIdIsRedirected(itemid))
			{
				hastup = true;	/* this page won't be truncatable */
				continue;
			}

			ItemPointerSet(&(tuple.t_self), blkno, offnum);

			/*
			 * DEAD item pointers are to be vacuumed normally; but we don't
			 * count them in tups_vacuumed, else we'd be double-counting (at
			 * least in the common case where heap_page_prune() just freed up
			 * a non-HOT tuple).
			 */
			if (ItemIdIsDead(itemid))
			{
				lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
				continue;
			}

			Assert(ItemIdIsNormal(itemid));

			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);

			tupgone = false;

			switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
			{
				case HEAPTUPLE_DEAD:

					/*
					 * Ordinarily, DEAD tuples would have been removed by
					 * heap_page_prune(), but it's possible that the tuple
					 * state changed since heap_page_prune() looked.  In
					 * particular an INSERT_IN_PROGRESS tuple could have
					 * changed to DEAD if the inserter aborted.  So this
					 * cannot be considered an error condition.
					 *
					 * If the tuple is HOT-updated then it must only be
					 * removed by a prune operation; so we keep it just as if
					 * it were RECENTLY_DEAD.  Also, if it's a heap-only
					 * tuple, we choose to keep it, because it'll be a lot
					 * cheaper to get rid of it in the next pruning pass than
					 * to treat it like an indexed tuple.
					 */
					if (HeapTupleIsHotUpdated(&tuple) ||
						HeapTupleIsHeapOnly(&tuple))
						nkeep += 1;
					else
						tupgone = true; /* we can delete the tuple */
					break;
				case HEAPTUPLE_LIVE:
					/* Tuple is good --- but let's do some validity checks */
					if (onerel->rd_rel->relhasoids &&
						!OidIsValid(HeapTupleGetOid(&tuple)))
						elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
							 relname, blkno, offnum);
					break;
				case HEAPTUPLE_RECENTLY_DEAD:

					/*
					 * If tuple is recently deleted then we must not remove it
					 * from relation.
					 */
					nkeep += 1;
					break;
				case HEAPTUPLE_INSERT_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					break;
				case HEAPTUPLE_DELETE_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					break;
				default:
					elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
					break;
			}

			if (tupgone)
			{
				lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
				tups_vacuumed += 1;
			}
			else
			{
				num_tuples += 1;
				hastup = true;

				/*
				 * Each non-removable tuple must be checked to see if it needs
				 * freezing.  Note we already have exclusive buffer lock.
				 */
				if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
									  InvalidBuffer))
					frozen[nfrozen++] = offnum;
			}
		}						/* scan along page */

		/*
		 * If we froze any tuples, mark the buffer dirty, and write a WAL
		 * record recording the changes.  We must log the changes to be
		 * crash-safe against future truncation of CLOG.
		 */
		if (nfrozen > 0)
		{
			MarkBufferDirty(buf);
			/* no XLOG for temp tables, though */
			if (!onerel->rd_istemp)
			{
				XLogRecPtr	recptr;

				recptr = log_heap_freeze(onerel, buf, FreezeLimit,
										 frozen, nfrozen);
				PageSetLSN(page, recptr);
				PageSetTLI(page, ThisTimeLineID);
			}
		}

		/*
		 * If there are no indexes then we can vacuum the page right now
		 * instead of doing a second scan.
		 */
		if (nindexes == 0 &&
			vacrelstats->num_dead_tuples > 0)
		{
			/* Remove tuples from heap */
			lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats);
			/* Forget the now-vacuumed tuples, and press on */
			vacrelstats->num_dead_tuples = 0;
			vacuumed_pages++;
		}

		freespace = PageGetHeapFreeSpace(page);

		/* Remember the location of the last page with nonremovable tuples */
		if (hastup)
			vacrelstats->nonempty_pages = blkno + 1;

		UnlockReleaseBuffer(buf);

		/*
		 * If we remembered any tuples for deletion, then the page will be
		 * visited again by lazy_vacuum_heap, which will compute and record
		 * its post-compaction free space.	If not, then we're done with this
		 * page, so remember its free space as-is.	(This path will always be
		 * taken if there are no indexes.)
		 */
		if (vacrelstats->num_dead_tuples == prev_dead_count)
			RecordPageWithFreeSpace(onerel, blkno, freespace);
	}

	/* save stats for use later */
	vacrelstats->rel_tuples = num_tuples;
	vacrelstats->tuples_deleted = tups_vacuumed;

	/* If any tuples need to be deleted, perform final vacuum cycle */
	/* XXX put a threshold on min number of tuples here? */
	if (vacrelstats->num_dead_tuples > 0)
	{
		/* Remove index entries */
		for (i = 0; i < nindexes; i++)
			lazy_vacuum_index(Irel[i],
							  &indstats[i],
							  vacrelstats);
		/* Remove tuples from heap */
		lazy_vacuum_heap(onerel, vacrelstats);
		vacrelstats->num_index_scans++;
	}

	/* Do post-vacuum cleanup and statistics update for each index */
	for (i = 0; i < nindexes; i++)
		lazy_cleanup_index(Irel[i], indstats[i], vacrelstats);

	/* If no indexes, make log report that lazy_vacuum_heap would've made */
	if (vacuumed_pages)
		ereport(elevel,
				(errmsg("\"%s\": removed %.0f row versions in %u pages",
						RelationGetRelationName(onerel),
						tups_vacuumed, vacuumed_pages)));

	ereport(elevel,
			(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
					RelationGetRelationName(onerel),
					tups_vacuumed, num_tuples, nblocks),
			 errdetail("%.0f dead row versions cannot be removed yet.\n"
					   "There were %.0f unused item pointers.\n"
					   "%u pages are entirely empty.\n"
					   "%s.",
					   nkeep,
					   nunused,
					   empty_pages,
					   pg_rusage_show(&ru0))));
}


/*
 *	lazy_vacuum_heap() -- second pass over the heap
 *
 *		This routine marks dead tuples as unused and compacts out free
 *		space on their pages.  Pages not having dead tuples recorded from
 *		lazy_scan_heap are not visited at all.
 *
 * Note: the reason for doing this as a second pass is we cannot remove
 * the tuples until we've removed their index entries, and we want to
 * process index entry removal in batches as large as possible.
 */
static void
lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
{
	int			tupindex;
	int			npages;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);
	npages = 0;

	tupindex = 0;
	while (tupindex < vacrelstats->num_dead_tuples)
	{
		BlockNumber tblk;
		Buffer		buf;
		Page		page;
		Size		freespace;

		vacuum_delay_point();

		tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
		buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL,
								 vac_strategy);
		LockBufferForCleanup(buf);
		tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats);

		/* Now that we've compacted the page, record its available space */
		page = BufferGetPage(buf);
		freespace = PageGetHeapFreeSpace(page);

		UnlockReleaseBuffer(buf);
		RecordPageWithFreeSpace(onerel, tblk, freespace);
		npages++;
	}

	ereport(elevel,
			(errmsg("\"%s\": removed %d row versions in %d pages",
					RelationGetRelationName(onerel),
					tupindex, npages),
			 errdetail("%s.",
					   pg_rusage_show(&ru0))));
}

/*
 *	lazy_vacuum_page() -- free dead tuples on a page
 *					 and repair its fragmentation.
 *
 * Caller must hold pin and buffer cleanup lock on the buffer.
 *
 * tupindex is the index in vacrelstats->dead_tuples of the first dead
 * tuple for this page.  We assume the rest follow sequentially.
 * The return value is the first tupindex after the tuples of this page.
 */
static int
lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
				 int tupindex, LVRelStats *vacrelstats)
{
	Page		page = BufferGetPage(buffer);
	OffsetNumber unused[MaxOffsetNumber];
	int			uncnt = 0;

	START_CRIT_SECTION();

	for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
	{
		BlockNumber tblk;
		OffsetNumber toff;
		ItemId		itemid;

		tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
		if (tblk != blkno)
			break;				/* past end of tuples for this block */
		toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
		itemid = PageGetItemId(page, toff);
		ItemIdSetUnused(itemid);
		unused[uncnt++] = toff;
	}

	PageRepairFragmentation(page);

	MarkBufferDirty(buffer);

	/* XLOG stuff */
	if (!onerel->rd_istemp)
	{
		XLogRecPtr	recptr;

		recptr = log_heap_clean(onerel, buffer,
								NULL, 0, NULL, 0,
								unused, uncnt,
								false);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
	}

	END_CRIT_SECTION();

	return tupindex;
}

/*
 *	lazy_vacuum_index() -- vacuum one index relation.
 *
 *		Delete all the index entries pointing to tuples listed in
 *		vacrelstats->dead_tuples, and update running statistics.
 */
static void
lazy_vacuum_index(Relation indrel,
				  IndexBulkDeleteResult **stats,
				  LVRelStats *vacrelstats)
{
	IndexVacuumInfo ivinfo;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	ivinfo.index = indrel;
	ivinfo.vacuum_full = false;
	ivinfo.message_level = elevel;
	/* We don't yet know rel_tuples, so pass -1 */
	ivinfo.num_heap_tuples = -1;
	ivinfo.strategy = vac_strategy;

	/* Do bulk deletion */
	*stats = index_bulk_delete(&ivinfo, *stats,
							   lazy_tid_reaped, (void *) vacrelstats);

	ereport(elevel,
			(errmsg("scanned index \"%s\" to remove %d row versions",
					RelationGetRelationName(indrel),
					vacrelstats->num_dead_tuples),
			 errdetail("%s.", pg_rusage_show(&ru0))));
}

/*
 *	lazy_cleanup_index() -- do post-vacuum cleanup for one index relation.
 */
static void
lazy_cleanup_index(Relation indrel,
				   IndexBulkDeleteResult *stats,
				   LVRelStats *vacrelstats)
{
	IndexVacuumInfo ivinfo;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	ivinfo.index = indrel;
	ivinfo.vacuum_full = false;
	ivinfo.message_level = elevel;
	ivinfo.num_heap_tuples = vacrelstats->rel_tuples;
	ivinfo.strategy = vac_strategy;

	stats = index_vacuum_cleanup(&ivinfo, stats);

	if (!stats)
		return;

	/* now update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(indrel),
						stats->num_pages,
						stats->num_index_tuples,
						false, InvalidTransactionId);

	ereport(elevel,
			(errmsg("index \"%s\" now contains %.0f row versions in %u pages",
					RelationGetRelationName(indrel),
					stats->num_index_tuples,
					stats->num_pages),
			 errdetail("%.0f index row versions were removed.\n"
			 "%u index pages have been deleted, %u are currently reusable.\n"
					   "%s.",
					   stats->tuples_removed,
					   stats->pages_deleted, stats->pages_free,
					   pg_rusage_show(&ru0))));

	pfree(stats);
}

/*
 * lazy_truncate_heap - try to truncate off any empty pages at the end
 */
static void
lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
{
	BlockNumber old_rel_pages = vacrelstats->rel_pages;
	BlockNumber new_rel_pages;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	/*
	 * We need full exclusive lock on the relation in order to do truncation.
	 * If we can't get it, give up rather than waiting --- we don't want to
	 * block other backends, and we don't want to deadlock (which is quite
	 * possible considering we already hold a lower-grade lock).
	 */
	if (!ConditionalLockRelation(onerel, AccessExclusiveLock))
		return;

	/*
	 * Now that we have exclusive lock, look to see if the rel has grown
	 * whilst we were vacuuming with non-exclusive lock.  If so, give up; the
	 * newly added pages presumably contain non-deletable tuples.
	 */
	new_rel_pages = RelationGetNumberOfBlocks(onerel);
	if (new_rel_pages != old_rel_pages)
	{
		/* might as well use the latest news when we update pg_class stats */
		vacrelstats->rel_pages = new_rel_pages;
		UnlockRelation(onerel, AccessExclusiveLock);
		return;
	}

	/*
	 * Scan backwards from the end to verify that the end pages actually
	 * contain no tuples.  This is *necessary*, not optional, because other
	 * backends could have added tuples to these pages whilst we were
	 * vacuuming.
	 */
	new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);

	if (new_rel_pages >= old_rel_pages)
	{
		/* can't do anything after all */
		UnlockRelation(onerel, AccessExclusiveLock);
		return;
	}

	/*
	 * Okay to truncate.
	 */
	FreeSpaceMapTruncateRel(onerel, new_rel_pages);
	RelationTruncate(onerel, new_rel_pages);

	/*
	 * Note: once we have truncated, we *must* keep the exclusive lock until
	 * commit.	The sinval message that will be sent at commit (as a result of
	 * vac_update_relstats()) must be received by other backends, to cause
	 * them to reset their rd_targblock values, before they can safely access
	 * the table again.
	 */

	/* update statistics */
	vacrelstats->rel_pages = new_rel_pages;
	vacrelstats->pages_removed = old_rel_pages - new_rel_pages;

	ereport(elevel,
			(errmsg("\"%s\": truncated %u to %u pages",
					RelationGetRelationName(onerel),
					old_rel_pages, new_rel_pages),
			 errdetail("%s.",
					   pg_rusage_show(&ru0))));
}

/*
 * Rescan end pages to verify that they are (still) empty of tuples.
 *
 * Returns number of nondeletable pages (last nonempty page + 1).
 */
static BlockNumber
count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
{
	BlockNumber blkno;

	/* Strange coding of loop control is needed because blkno is unsigned */
	blkno = vacrelstats->rel_pages;
	while (blkno > vacrelstats->nonempty_pages)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		bool		hastup;

		/*
		 * We don't insert a vacuum delay point here, because we have an
		 * exclusive lock on the table which we want to hold for as short a
		 * time as possible.  We still need to check for interrupts however.
		 */
		CHECK_FOR_INTERRUPTS();

		blkno--;

		buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
								 RBM_NORMAL, vac_strategy);

		/* In this phase we only need shared access to the buffer */
		LockBuffer(buf, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buf);

		if (PageIsNew(page) || PageIsEmpty(page))
		{
			/* PageIsNew probably shouldn't happen... */
			UnlockReleaseBuffer(buf);
			continue;
		}

		hastup = false;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid;

			itemid = PageGetItemId(page, offnum);

			/*
			 * Note: any non-unused item should be taken as a reason to keep
			 * this page.  We formerly thought that DEAD tuples could be
			 * thrown away, but that's not so, because we'd not have cleaned
			 * out their index entries.
			 */
			if (ItemIdIsUsed(itemid))
			{
				hastup = true;
				break;			/* can stop scanning */
			}
		}						/* scan along page */

		UnlockReleaseBuffer(buf);

		/* Done scanning if we found a tuple here */
		if (hastup)
			return blkno + 1;
	}

	/*
	 * If we fall out of the loop, all the previously-thought-to-be-empty
	 * pages still are; we need not bother to look at the last known-nonempty
	 * page.
	 */
	return vacrelstats->nonempty_pages;
}

/*
 * lazy_space_alloc - space allocation decisions for lazy vacuum
 *
 * See the comments at the head of this file for rationale.
 */
static void
lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
{
	long		maxtuples;

	if (vacrelstats->hasindex)
	{
		maxtuples = (maintenance_work_mem * 1024L) / sizeof(ItemPointerData);
		maxtuples = Min(maxtuples, INT_MAX);
		maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData));

		/* curious coding here to ensure the multiplication can't overflow */
		if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
			maxtuples = relblocks * LAZY_ALLOC_TUPLES;

		/* stay sane if small maintenance_work_mem */
		maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
	}
	else
	{
		maxtuples = MaxHeapTuplesPerPage;
	}

	vacrelstats->num_dead_tuples = 0;
	vacrelstats->max_dead_tuples = (int) maxtuples;
	vacrelstats->dead_tuples = (ItemPointer)
		palloc(maxtuples * sizeof(ItemPointerData));
}
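/*
 * Worked example of the sizing above (editorial illustration only, assuming
 * the usual 6-byte ItemPointerData and 8K pages, where MaxHeapTuplesPerPage
 * is about 291): with maintenance_work_mem = 16384 (16 MB), an indexed table
 * gets room for 16384 * 1024 / 6, or roughly 2.8 million, dead-tuple TIDs;
 * a 100-block table is capped at 100 * LAZY_ALLOC_TUPLES, about 29100
 * entries; and a table with no indexes gets just one page's worth, since it
 * is vacuumed page by page.
 */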

/*
 * lazy_record_dead_tuple - remember one deletable tuple
 */
static void
lazy_record_dead_tuple(LVRelStats *vacrelstats,
					   ItemPointer itemptr)
{
	/*
	 * The array shouldn't overflow under normal behavior, but perhaps it
	 * could if we are given a really small maintenance_work_mem. In that
	 * case, just forget the last few tuples (we'll get 'em next time).
	 */
	if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
	{
		vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
		vacrelstats->num_dead_tuples++;
	}
}

/*
 *	lazy_tid_reaped() -- is a particular tid deletable?
 *
 *		This has the right signature to be an IndexBulkDeleteCallback.
 *
 *		Assumes dead_tuples array is in sorted order.
 */
static bool
lazy_tid_reaped(ItemPointer itemptr, void *state)
{
	LVRelStats *vacrelstats = (LVRelStats *) state;
	ItemPointer res;

	res = (ItemPointer) bsearch((void *) itemptr,
								(void *) vacrelstats->dead_tuples,
								vacrelstats->num_dead_tuples,
								sizeof(ItemPointerData),
								vac_cmp_itemptr);

	return (res != NULL);
}

/*
 * Comparator routines for use with qsort() and bsearch().
 */
static int
vac_cmp_itemptr(const void *left, const void *right)
{
	BlockNumber lblk,
				rblk;
	OffsetNumber loff,
				roff;

	lblk = ItemPointerGetBlockNumber((ItemPointer) left);
	rblk = ItemPointerGetBlockNumber((ItemPointer) right);

	if (lblk < rblk)
		return -1;
	if (lblk > rblk)
		return 1;

	loff = ItemPointerGetOffsetNumber((ItemPointer) left);
	roff = ItemPointerGetOffsetNumber((ItemPointer) right);

	if (loff < roff)
		return -1;
	if (loff > roff)
		return 1;

	return 0;
}