/*-------------------------------------------------------------------------
 *
 * vacuumlazy.c
 *	  Concurrent ("lazy") vacuuming.
 *
 *
 * The major space usage for LAZY VACUUM is storage for the array of dead
 * tuple TIDs, with the next biggest need being storage for per-disk-page
 * free space info.  We want to ensure we can vacuum even the very largest
 * relations with finite memory space usage.  To do that, we set upper bounds
 * on the number of tuples and pages we will keep track of at once.
 *
 * We are willing to use at most maintenance_work_mem memory space to keep
 * track of dead tuples.  We initially allocate an array of TIDs of that size,
 * with an upper limit that depends on table size (this limit ensures we don't
 * allocate a huge area uselessly for vacuuming small tables).	If the array
 * threatens to overflow, we suspend the heap scan phase and perform a pass of
 * index cleanup and page compaction, then resume the heap scan with an empty
 * TID array.
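 *
 * For a rough sense of scale: an ItemPointerData is 6 bytes, so with
 * maintenance_work_mem set to 16MB the array can hold roughly 2.8 million
 * dead-tuple TIDs before an index-cleanup pass is forced (small tables are
 * capped lower; see lazy_space_alloc).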
 *
 * If we're processing a table with no indexes, we can just vacuum each page
 * as we go; there's no need to save up multiple tuples to minimize the number
 * of index scans performed.  So we don't use maintenance_work_mem memory for
 * the TID array, just enough to hold as many heap tuples as fit on one page.
 *
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.111 2008/11/19 10:34:51 heikki Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <math.h>

#include "access/genam.h"
#include "access/heapam.h"
#include "access/transam.h"
#include "catalog/storage.h"
#include "commands/dbcommands.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/pg_rusage.h"
#include "utils/tqual.h"


/*
 * Space/time tradeoff parameters: do these need to be user-tunable?
 *
 * To consider truncating the relation, we want there to be at least
 * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
 * is less) potentially-freeable pages.
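 *
 * For example, a 4000-page relation qualifies once 250 pages (4000/16) are
 * potentially freeable, while relations of 16000 pages and up need only the
 * 1000-page minimum.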
 */
#define REL_TRUNCATE_MINIMUM	1000
#define REL_TRUNCATE_FRACTION	16

/*
 * Guesstimation of number of dead tuples per page.  This is used to
 * provide an upper limit to memory allocated when vacuuming small
 * tables.
 */
#define LAZY_ALLOC_TUPLES		MaxHeapTuplesPerPage

typedef struct LVRelStats
{
	/* hasindex = true means two-pass strategy; false means one-pass */
	bool		hasindex;
	/* Overall statistics about rel */
	BlockNumber rel_pages;
	double		rel_tuples;
	BlockNumber pages_removed;
	double		tuples_deleted;
	BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
	/* List of TIDs of tuples we intend to delete */
	/* NB: this list is ordered by TID address */
	int			num_dead_tuples;	/* current # of entries */
	int			max_dead_tuples;	/* # slots allocated in array */
	ItemPointer dead_tuples;	/* array of ItemPointerData */
	int			num_index_scans;
} LVRelStats;


/* A few variables that don't seem worth passing around as parameters */
static int	elevel = -1;

static TransactionId OldestXmin;
static TransactionId FreezeLimit;

static BufferAccessStrategy vac_strategy;


/* non-export function prototypes */
static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
			   Relation *Irel, int nindexes);
static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
static void lazy_vacuum_index(Relation indrel,
				  IndexBulkDeleteResult **stats,
				  LVRelStats *vacrelstats);
static void lazy_cleanup_index(Relation indrel,
				   IndexBulkDeleteResult *stats,
				   LVRelStats *vacrelstats);
static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
				 int tupindex, LVRelStats *vacrelstats);
static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
static BlockNumber count_nondeletable_pages(Relation onerel,
						 LVRelStats *vacrelstats);
static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks);
static void lazy_record_dead_tuple(LVRelStats *vacrelstats,
					   ItemPointer itemptr);
static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
static int	vac_cmp_itemptr(const void *left, const void *right);


/*
 *	lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indexes, and
 *		updates its relpages and reltuples statistics.
 *
 *		At entry, we have already established a transaction and opened
 *		and locked the relation.
 */
void
lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
				BufferAccessStrategy bstrategy)
{
	LVRelStats *vacrelstats;
	Relation   *Irel;
	int			nindexes;
	BlockNumber possibly_freeable;
	PGRUsage	ru0;
	TimestampTz starttime = 0;

	pg_rusage_init(&ru0);

	/* measure elapsed time iff autovacuum logging requires it */
	if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration > 0)
		starttime = GetCurrentTimestamp();

	if (vacstmt->verbose)
		elevel = INFO;
	else
		elevel = DEBUG2;

	vac_strategy = bstrategy;

	vacuum_set_xid_limits(vacstmt->freeze_min_age, onerel->rd_rel->relisshared,
						  &OldestXmin, &FreezeLimit);

	vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));

	vacrelstats->num_index_scans = 0;

	/* Open all indexes of the relation */
	vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
	vacrelstats->hasindex = (nindexes > 0);

	/* Do the vacuuming */
	lazy_scan_heap(onerel, vacrelstats, Irel, nindexes);

	/* Done with indexes */
	vac_close_indexes(nindexes, Irel, NoLock);

	/*
	 * Optionally truncate the relation.
	 *
	 * Don't even think about it unless we have a shot at releasing a goodly
	 * number of pages.  Otherwise, the time taken isn't worth it.
	 */
	possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
	if (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
		possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION)
		lazy_truncate_heap(onerel, vacrelstats);

	/* Vacuum the Free Space Map */
	FreeSpaceMapVacuum(onerel);

	/* Update statistics in pg_class */
	vac_update_relstats(onerel,
						vacrelstats->rel_pages, vacrelstats->rel_tuples,
						vacrelstats->hasindex, FreezeLimit);

	/* report results to the stats collector, too */
	pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared,
						 vacstmt->analyze, vacrelstats->rel_tuples);

	/* and log the action if appropriate */
	if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
	{
		if (Log_autovacuum_min_duration == 0 ||
			TimestampDifferenceExceeds(starttime, GetCurrentTimestamp(),
									   Log_autovacuum_min_duration))
			ereport(LOG,
					(errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"
							"pages: %d removed, %d remain\n"
							"tuples: %.0f removed, %.0f remain\n"
							"system usage: %s",
							get_database_name(MyDatabaseId),
							get_namespace_name(RelationGetNamespace(onerel)),
							RelationGetRelationName(onerel),
							vacrelstats->num_index_scans,
						  vacrelstats->pages_removed, vacrelstats->rel_pages,
						vacrelstats->tuples_deleted, vacrelstats->rel_tuples,
							pg_rusage_show(&ru0))));
	}
}


/*
 *	lazy_scan_heap() -- scan an open heap relation
 *
 *		This routine sets commit status bits, builds lists of dead tuples
 *		and pages with free space, and calculates statistics on the number
 *		of live tuples in the heap.  When done, or when we run low on space
 *		for dead-tuple TIDs, invoke vacuuming of indexes and heap.
 *
 *		If there are no indexes then we just vacuum each dirty page as we
 *		process it, since there's no point in gathering many tuples.
 */
static void
lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
			   Relation *Irel, int nindexes)
{
	BlockNumber nblocks,
				blkno;
	HeapTupleData tuple;
	char	   *relname;
	BlockNumber empty_pages,
				vacuumed_pages;
	double		num_tuples,
				tups_vacuumed,
				nkeep,
				nunused;
	IndexBulkDeleteResult **indstats;
	int			i;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	relname = RelationGetRelationName(onerel);
	ereport(elevel,
			(errmsg("vacuuming \"%s.%s\"",
					get_namespace_name(RelationGetNamespace(onerel)),
					relname)));

	empty_pages = vacuumed_pages = 0;
	num_tuples = tups_vacuumed = nkeep = nunused = 0;

	indstats = (IndexBulkDeleteResult **)
		palloc0(nindexes * sizeof(IndexBulkDeleteResult *));

	nblocks = RelationGetNumberOfBlocks(onerel);
	vacrelstats->rel_pages = nblocks;
	vacrelstats->nonempty_pages = 0;

	lazy_space_alloc(vacrelstats, nblocks);

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		bool		tupgone,
					hastup;
		int			prev_dead_count;
		OffsetNumber frozen[MaxOffsetNumber];
		int			nfrozen;
		Size		freespace;

		vacuum_delay_point();

		/*
		 * If we are close to overrunning the available space for dead-tuple
		 * TIDs, pause and do a cycle of vacuuming before we tackle this page.
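		 *
		 * The MaxHeapTuplesPerPage headroom covers the worst case: this one
		 * page could contribute up to that many new dead-tuple TIDs.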
		 */
		if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
			vacrelstats->num_dead_tuples > 0)
		{
			/* Remove index entries */
			for (i = 0; i < nindexes; i++)
				lazy_vacuum_index(Irel[i],
								  &indstats[i],
								  vacrelstats);
			/* Remove tuples from heap */
			lazy_vacuum_heap(onerel, vacrelstats);
			/* Forget the now-vacuumed tuples, and press on */
			vacrelstats->num_dead_tuples = 0;
			vacrelstats->num_index_scans++;
		}

		buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
								 RBM_NORMAL, vac_strategy);

		/* We need buffer cleanup lock so that we can prune HOT chains. */
		LockBufferForCleanup(buf);

		page = BufferGetPage(buf);

		if (PageIsNew(page))
		{
			/*
			 * An all-zeroes page could be left over if a backend extends the
			 * relation but crashes before initializing the page. Reclaim such
			 * pages for use.
			 *
			 * We have to be careful here because we could be looking at a
			 * page that someone has just added to the relation and not yet
			 * been able to initialize (see RelationGetBufferForTuple). To
			 * protect against that, release the buffer lock, grab the
			 * relation extension lock momentarily, and re-lock the buffer. If
			 * the page is still uninitialized by then, it must be left over
			 * from a crashed backend, and we can initialize it.
			 *
			 * We don't really need the relation lock when this is a new or
			 * temp relation, but it's probably not worth the code space to
			 * check that, since this surely isn't a critical path.
			 *
			 * Note: the comparable code in vacuum.c need not worry because
			 * it's got exclusive lock on the whole relation.
			 */
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			LockRelationForExtension(onerel, ExclusiveLock);
			UnlockRelationForExtension(onerel, ExclusiveLock);
			LockBufferForCleanup(buf);
			if (PageIsNew(page))
			{
				ereport(WARNING,
				(errmsg("relation \"%s\" page %u is uninitialized --- fixing",
						relname, blkno)));
				PageInit(page, BufferGetPageSize(buf), 0);
				empty_pages++;
			}
			freespace = PageGetHeapFreeSpace(page);
			MarkBufferDirty(buf);
			UnlockReleaseBuffer(buf);

			RecordPageWithFreeSpace(onerel, blkno, freespace);
			continue;
		}

		if (PageIsEmpty(page))
		{
			empty_pages++;
			freespace = PageGetHeapFreeSpace(page);
			UnlockReleaseBuffer(buf);
			RecordPageWithFreeSpace(onerel, blkno, freespace);
			continue;
		}

		/*
		 * Prune all HOT-update chains in this page.
		 *
		 * We count tuples removed by the pruning step as removed by VACUUM.
		 */
		tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin,
										 false, false);

		/*
		 * Now scan the page to collect vacuumable items and check for tuples
		 * requiring freezing.
		 */
		nfrozen = 0;
		hastup = false;
		prev_dead_count = vacrelstats->num_dead_tuples;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid;

			itemid = PageGetItemId(page, offnum);

			/* Unused items require no processing, but we count 'em */
			if (!ItemIdIsUsed(itemid))
			{
				nunused += 1;
				continue;
			}

			/* Redirect items mustn't be touched */
			if (ItemIdIsRedirected(itemid))
			{
				hastup = true;	/* this page won't be truncatable */
				continue;
			}

			ItemPointerSet(&(tuple.t_self), blkno, offnum);

			/*
			 * DEAD item pointers are to be vacuumed normally; but we don't
			 * count them in tups_vacuumed, else we'd be double-counting (at
			 * least in the common case where heap_page_prune() just freed up
			 * a non-HOT tuple).
			 */
			if (ItemIdIsDead(itemid))
			{
				lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
				continue;
			}

			Assert(ItemIdIsNormal(itemid));

			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);

			tupgone = false;

			switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
			{
				case HEAPTUPLE_DEAD:

					/*
					 * Ordinarily, DEAD tuples would have been removed by
					 * heap_page_prune(), but it's possible that the tuple
					 * state changed since heap_page_prune() looked.  In
					 * particular an INSERT_IN_PROGRESS tuple could have
					 * changed to DEAD if the inserter aborted.  So this
					 * cannot be considered an error condition.
					 *
					 * If the tuple is HOT-updated then it must only be
					 * removed by a prune operation; so we keep it just as if
					 * it were RECENTLY_DEAD.  Also, if it's a heap-only
					 * tuple, we choose to keep it, because it'll be a lot
					 * cheaper to get rid of it in the next pruning pass than
					 * to treat it like an indexed tuple.
					 */
					if (HeapTupleIsHotUpdated(&tuple) ||
						HeapTupleIsHeapOnly(&tuple))
						nkeep += 1;
					else
						tupgone = true; /* we can delete the tuple */
					break;
				case HEAPTUPLE_LIVE:
					/* Tuple is good --- but let's do some validity checks */
					if (onerel->rd_rel->relhasoids &&
						!OidIsValid(HeapTupleGetOid(&tuple)))
						elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
							 relname, blkno, offnum);
					break;
				case HEAPTUPLE_RECENTLY_DEAD:

					/*
					 * If tuple is recently deleted then we must not remove it
					 * from relation.
					 */
					nkeep += 1;
					break;
				case HEAPTUPLE_INSERT_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					break;
				case HEAPTUPLE_DELETE_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					break;
				default:
					elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
					break;
			}

			if (tupgone)
			{
				lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
				tups_vacuumed += 1;
			}
			else
			{
				num_tuples += 1;
				hastup = true;

				/*
				 * Each non-removable tuple must be checked to see if it needs
				 * freezing.  Note we already have exclusive buffer lock.
				 */
				if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
									  InvalidBuffer))
					frozen[nfrozen++] = offnum;
			}
		}						/* scan along page */

		/*
		 * If we froze any tuples, mark the buffer dirty, and write a WAL
		 * record recording the changes.  We must log the changes to be
		 * crash-safe against future truncation of CLOG.
		 */
		if (nfrozen > 0)
		{
			MarkBufferDirty(buf);
			/* no XLOG for temp tables, though */
			if (!onerel->rd_istemp)
			{
				XLogRecPtr	recptr;

				recptr = log_heap_freeze(onerel, buf, FreezeLimit,
										 frozen, nfrozen);
				PageSetLSN(page, recptr);
				PageSetTLI(page, ThisTimeLineID);
			}
		}

		/*
		 * If there are no indexes then we can vacuum the page right now
		 * instead of doing a second scan.
		 */
		if (nindexes == 0 &&
			vacrelstats->num_dead_tuples > 0)
		{
			/* Remove tuples from heap */
			lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats);
			/* Forget the now-vacuumed tuples, and press on */
			vacrelstats->num_dead_tuples = 0;
			vacuumed_pages++;
		}

		freespace = PageGetHeapFreeSpace(page);

		/* Remember the location of the last page with nonremovable tuples */
		if (hastup)
			vacrelstats->nonempty_pages = blkno + 1;

		UnlockReleaseBuffer(buf);

		/*
		 * If we remembered any tuples for deletion, then the page will be
		 * visited again by lazy_vacuum_heap, which will compute and record
		 * its post-compaction free space.	If not, then we're done with this
		 * page, so remember its free space as-is.	(This path will always be
		 * taken if there are no indexes.)
		 */
		if (vacrelstats->num_dead_tuples == prev_dead_count)
			RecordPageWithFreeSpace(onerel, blkno, freespace);
	}

	/* save stats for use later */
	vacrelstats->rel_tuples = num_tuples;
	vacrelstats->tuples_deleted = tups_vacuumed;

	/* If any tuples need to be deleted, perform final vacuum cycle */
	/* XXX put a threshold on min number of tuples here? */
	if (vacrelstats->num_dead_tuples > 0)
	{
		/* Remove index entries */
		for (i = 0; i < nindexes; i++)
			lazy_vacuum_index(Irel[i],
							  &indstats[i],
							  vacrelstats);
		/* Remove tuples from heap */
		lazy_vacuum_heap(onerel, vacrelstats);
		vacrelstats->num_index_scans++;
	}

	/* Do post-vacuum cleanup and statistics update for each index */
	for (i = 0; i < nindexes; i++)
		lazy_cleanup_index(Irel[i], indstats[i], vacrelstats);

	/* If no indexes, make log report that lazy_vacuum_heap would've made */
	if (vacuumed_pages)
		ereport(elevel,
				(errmsg("\"%s\": removed %.0f row versions in %u pages",
						RelationGetRelationName(onerel),
						tups_vacuumed, vacuumed_pages)));

	ereport(elevel,
			(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
					RelationGetRelationName(onerel),
					tups_vacuumed, num_tuples, nblocks),
			 errdetail("%.0f dead row versions cannot be removed yet.\n"
					   "There were %.0f unused item pointers.\n"
					   "%u pages are entirely empty.\n"
					   "%s.",
					   nkeep,
					   nunused,
					   empty_pages,
					   pg_rusage_show(&ru0))));
}


/*
 *	lazy_vacuum_heap() -- second pass over the heap
 *
 *		This routine marks dead tuples as unused and compacts out free
 *		space on their pages.  Pages not having dead tuples recorded from
 *		lazy_scan_heap are not visited at all.
 *
 * Note: the reason for doing this as a second pass is we cannot remove
 * the tuples until we've removed their index entries, and we want to
 * process index entry removal in batches as large as possible.
 */
static void
lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
{
	int			tupindex;
	int			npages;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);
	npages = 0;

	tupindex = 0;
	while (tupindex < vacrelstats->num_dead_tuples)
	{
		BlockNumber tblk;
		Buffer		buf;
		Page		page;
		Size		freespace;

		vacuum_delay_point();

		tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
		buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL,
								 vac_strategy);
		LockBufferForCleanup(buf);
		tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats);

		/* Now that we've compacted the page, record its available space */
		page = BufferGetPage(buf);
		freespace = PageGetHeapFreeSpace(page);

		UnlockReleaseBuffer(buf);
		RecordPageWithFreeSpace(onerel, tblk, freespace);
		npages++;
	}

	ereport(elevel,
			(errmsg("\"%s\": removed %d row versions in %d pages",
					RelationGetRelationName(onerel),
					tupindex, npages),
			 errdetail("%s.",
					   pg_rusage_show(&ru0))));
}

/*
 *	lazy_vacuum_page() -- free dead tuples on a page
 *					 and repair its fragmentation.
 *
 * Caller must hold pin and buffer cleanup lock on the buffer.
 *
 * tupindex is the index in vacrelstats->dead_tuples of the first dead
 * tuple for this page.  We assume the rest follow sequentially.
 * The return value is the first tupindex after the tuples of this page.
 */
static int
lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
				 int tupindex, LVRelStats *vacrelstats)
{
	Page		page = BufferGetPage(buffer);
	OffsetNumber unused[MaxOffsetNumber];
	int			uncnt = 0;

	START_CRIT_SECTION();

	for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
	{
		BlockNumber tblk;
		OffsetNumber toff;
		ItemId		itemid;

		tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
		if (tblk != blkno)
			break;				/* past end of tuples for this block */
		toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
		itemid = PageGetItemId(page, toff);
		ItemIdSetUnused(itemid);
		unused[uncnt++] = toff;
	}

	PageRepairFragmentation(page);

	MarkBufferDirty(buffer);

	/* XLOG stuff */
	if (!onerel->rd_istemp)
	{
		XLogRecPtr	recptr;

		recptr = log_heap_clean(onerel, buffer,
								NULL, 0, NULL, 0,
								unused, uncnt,
								false);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
	}

	END_CRIT_SECTION();

	return tupindex;
}

/*
 *	lazy_vacuum_index() -- vacuum one index relation.
 *
 *		Delete all the index entries pointing to tuples listed in
 *		vacrelstats->dead_tuples, and update running statistics.
 */
static void
lazy_vacuum_index(Relation indrel,
				  IndexBulkDeleteResult **stats,
				  LVRelStats *vacrelstats)
{
	IndexVacuumInfo ivinfo;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	ivinfo.index = indrel;
	ivinfo.vacuum_full = false;
	ivinfo.message_level = elevel;
	/* We don't yet know rel_tuples, so pass -1 */
	ivinfo.num_heap_tuples = -1;
	ivinfo.strategy = vac_strategy;

	/* Do bulk deletion */
	*stats = index_bulk_delete(&ivinfo, *stats,
							   lazy_tid_reaped, (void *) vacrelstats);

	ereport(elevel,
			(errmsg("scanned index \"%s\" to remove %d row versions",
					RelationGetRelationName(indrel),
					vacrelstats->num_dead_tuples),
			 errdetail("%s.", pg_rusage_show(&ru0))));
}

/*
 *	lazy_cleanup_index() -- do post-vacuum cleanup for one index relation.
 */
static void
lazy_cleanup_index(Relation indrel,
				   IndexBulkDeleteResult *stats,
				   LVRelStats *vacrelstats)
{
	IndexVacuumInfo ivinfo;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	ivinfo.index = indrel;
	ivinfo.vacuum_full = false;
	ivinfo.message_level = elevel;
	ivinfo.num_heap_tuples = vacrelstats->rel_tuples;
	ivinfo.strategy = vac_strategy;

	stats = index_vacuum_cleanup(&ivinfo, stats);

	if (!stats)
		return;

	/* now update statistics in pg_class */
	vac_update_relstats(indrel,
						stats->num_pages, stats->num_index_tuples,
						false, InvalidTransactionId);

	ereport(elevel,
			(errmsg("index \"%s\" now contains %.0f row versions in %u pages",
					RelationGetRelationName(indrel),
					stats->num_index_tuples,
					stats->num_pages),
			 errdetail("%.0f index row versions were removed.\n"
			 "%u index pages have been deleted, %u are currently reusable.\n"
					   "%s.",
					   stats->tuples_removed,
					   stats->pages_deleted, stats->pages_free,
					   pg_rusage_show(&ru0))));

	pfree(stats);
}

/*
 * lazy_truncate_heap - try to truncate off any empty pages at the end
 */
static void
lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
{
	BlockNumber old_rel_pages = vacrelstats->rel_pages;
	BlockNumber new_rel_pages;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	/*
	 * We need full exclusive lock on the relation in order to do truncation.
	 * If we can't get it, give up rather than waiting --- we don't want to
	 * block other backends, and we don't want to deadlock (which is quite
	 * possible considering we already hold a lower-grade lock).
	 */
	if (!ConditionalLockRelation(onerel, AccessExclusiveLock))
		return;

	/*
	 * Now that we have exclusive lock, look to see if the rel has grown
	 * whilst we were vacuuming with non-exclusive lock.  If so, give up; the
	 * newly added pages presumably contain non-deletable tuples.
	 */
	new_rel_pages = RelationGetNumberOfBlocks(onerel);
	if (new_rel_pages != old_rel_pages)
	{
		/* might as well use the latest news when we update pg_class stats */
		vacrelstats->rel_pages = new_rel_pages;
		UnlockRelation(onerel, AccessExclusiveLock);
		return;
	}

	/*
	 * Scan backwards from the end to verify that the end pages actually
	 * contain no tuples.  This is *necessary*, not optional, because other
	 * backends could have added tuples to these pages whilst we were
	 * vacuuming.
	 */
	new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);

	if (new_rel_pages >= old_rel_pages)
	{
		/* can't do anything after all */
		UnlockRelation(onerel, AccessExclusiveLock);
		return;
	}

	/*
	 * Okay to truncate.
	 */
	RelationTruncate(onerel, new_rel_pages);

	/*
	 * Note: once we have truncated, we *must* keep the exclusive lock until
	 * commit.	The sinval message that will be sent at commit (as a result of
	 * vac_update_relstats()) must be received by other backends, to cause
	 * them to reset their rd_targblock values, before they can safely access
	 * the table again.
	 */

	/* update statistics */
	vacrelstats->rel_pages = new_rel_pages;
	vacrelstats->pages_removed = old_rel_pages - new_rel_pages;

	ereport(elevel,
			(errmsg("\"%s\": truncated %u to %u pages",
					RelationGetRelationName(onerel),
					old_rel_pages, new_rel_pages),
			 errdetail("%s.",
					   pg_rusage_show(&ru0))));
}

/*
 * Rescan end pages to verify that they are (still) empty of tuples.
 *
 * Returns number of nondeletable pages (last nonempty page + 1).
 */
static BlockNumber
count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
{
	BlockNumber blkno;

	/* Strange coding of loop control is needed because blkno is unsigned */
	blkno = vacrelstats->rel_pages;
	while (blkno > vacrelstats->nonempty_pages)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		bool		hastup;

		/*
		 * We don't insert a vacuum delay point here, because we have an
		 * exclusive lock on the table which we want to hold for as short a
		 * time as possible.  We still need to check for interrupts however.
		 */
		CHECK_FOR_INTERRUPTS();

		blkno--;

		buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
								 RBM_NORMAL, vac_strategy);

		/* In this phase we only need shared access to the buffer */
		LockBuffer(buf, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buf);

		if (PageIsNew(page) || PageIsEmpty(page))
		{
			/* PageIsNew probably shouldn't happen... */
			UnlockReleaseBuffer(buf);
			continue;
		}

		hastup = false;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid;

			itemid = PageGetItemId(page, offnum);

			/*
			 * Note: any non-unused item should be taken as a reason to keep
			 * this page.  We formerly thought that DEAD tuples could be
			 * thrown away, but that's not so, because we'd not have cleaned
			 * out their index entries.
			 */
			if (ItemIdIsUsed(itemid))
			{
				hastup = true;
				break;			/* can stop scanning */
			}
		}						/* scan along page */

		UnlockReleaseBuffer(buf);

		/* Done scanning if we found a tuple here */
		if (hastup)
			return blkno + 1;
	}

	/*
	 * If we fall out of the loop, all the previously-thought-to-be-empty
	 * pages still are; we need not bother to look at the last known-nonempty
	 * page.
	 */
	return vacrelstats->nonempty_pages;
}

/*
 * lazy_space_alloc - space allocation decisions for lazy vacuum
 *
 * See the comments at the head of this file for rationale.
 */
static void
lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
{
	long		maxtuples;

	if (vacrelstats->hasindex)
	{
		maxtuples = (maintenance_work_mem * 1024L) / sizeof(ItemPointerData);
		maxtuples = Min(maxtuples, INT_MAX);
		maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData));

		/* curious coding here to ensure the multiplication can't overflow */
		if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
			maxtuples = relblocks * LAZY_ALLOC_TUPLES;
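
		/*
		 * Example of the per-table cap (assuming 8K pages, where
		 * MaxHeapTuplesPerPage is about 291): a 100-block table is limited
		 * to roughly 29,100 TID slots, however large maintenance_work_mem is.
		 */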

		/* stay sane if small maintenance_work_mem */
		maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
	}
	else
	{
		maxtuples = MaxHeapTuplesPerPage;
	}

	vacrelstats->num_dead_tuples = 0;
	vacrelstats->max_dead_tuples = (int) maxtuples;
	vacrelstats->dead_tuples = (ItemPointer)
		palloc(maxtuples * sizeof(ItemPointerData));
}

/*
 * lazy_record_dead_tuple - remember one deletable tuple
 */
static void
lazy_record_dead_tuple(LVRelStats *vacrelstats,
					   ItemPointer itemptr)
{
	/*
	 * The array shouldn't overflow under normal behavior, but perhaps it
	 * could if we are given a really small maintenance_work_mem. In that
	 * case, just forget the last few tuples (we'll get 'em next time).
	 */
	if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
	{
		vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
		vacrelstats->num_dead_tuples++;
	}
}

/*
 *	lazy_tid_reaped() -- is a particular tid deletable?
 *
 *		This has the right signature to be an IndexBulkDeleteCallback.
 *
 *		Assumes dead_tuples array is in sorted order.
 */
static bool
lazy_tid_reaped(ItemPointer itemptr, void *state)
{
	LVRelStats *vacrelstats = (LVRelStats *) state;
	ItemPointer res;

	res = (ItemPointer) bsearch((void *) itemptr,
								(void *) vacrelstats->dead_tuples,
								vacrelstats->num_dead_tuples,
								sizeof(ItemPointerData),
								vac_cmp_itemptr);

	return (res != NULL);
}

/*
 * Comparator routines for use with qsort() and bsearch().
 */
static int
vac_cmp_itemptr(const void *left, const void *right)
{
	BlockNumber lblk,
				rblk;
	OffsetNumber loff,
				roff;

	lblk = ItemPointerGetBlockNumber((ItemPointer) left);
	rblk = ItemPointerGetBlockNumber((ItemPointer) right);

	if (lblk < rblk)
		return -1;
	if (lblk > rblk)
		return 1;

	loff = ItemPointerGetOffsetNumber((ItemPointer) left);
	roff = ItemPointerGetOffsetNumber((ItemPointer) right);

	if (loff < roff)
		return -1;
	if (loff > roff)
		return 1;

	return 0;
}