/*-------------------------------------------------------------------------
 *
 * vacuumlazy.c
 *	  Concurrent ("lazy") vacuuming.
 *
 *
 * The major space usage for LAZY VACUUM is storage for the array of dead
 * tuple TIDs, with the next biggest need being storage for per-disk-page
 * free space info.  We want to ensure we can vacuum even the very largest
 * relations with finite memory space usage.  To do that, we set upper bounds
 * on the number of tuples and pages we will keep track of at once.
 *
 * We are willing to use at most maintenance_work_mem memory space to keep
 * track of dead tuples.  We initially allocate an array of TIDs of that size.
 * If the array threatens to overflow, we suspend the heap scan phase and
 * perform a pass of index cleanup and page compaction, then resume the heap
 * scan with an empty TID array.
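 *
 * (For a sense of scale: an ItemPointerData is 6 bytes, so a
 * maintenance_work_mem of 16MB holds roughly 2.8 million dead-tuple TIDs
 * before an index-cleanup pass is forced; the exact arithmetic is in
 * lazy_space_alloc, below.)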
 *
 * We can limit the storage for page free space to MaxFSMPages entries,
 * since that's the most the free space map will be willing to remember
 * anyway.	If the relation has fewer than that many pages with free space,
 * life is easy: just build an array of per-page info.	If it has more,
 * we store the free space info as a heap ordered by amount of free space,
 * so that we can discard the pages with least free space to ensure we never
 * have more than MaxFSMPages entries in all.  The surviving page entries
 * are passed to the free space map at conclusion of the scan.
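 *
 * (For example, if MaxFSMPages were 20000 and 50000 pages had interesting
 * amounts of free space, only the 20000 pages with the most free space
 * would survive the heap and be reported to the FSM.)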
 *
 * If we're processing a table with no indexes, we can just vacuum each page
 * as we go; there's no need to save up multiple tuples to minimize the number
 * of index scans performed.  So we don't use maintenance_work_mem memory for
 * the TID array, just enough to hold as many heap tuples as fit on one page.
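 *
 * (That is MaxHeapTuplesPerPage entries --- on the order of a few hundred
 * with the default 8K block size.)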
 *
 *
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.92 2007/09/10 17:58:45 alvherre Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <math.h>

#include "access/genam.h"
#include "access/heapam.h"
#include "access/transam.h"
#include "commands/dbcommands.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "storage/freespace.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/pg_rusage.h"


/*
 * Space/time tradeoff parameters: do these need to be user-tunable?
 *
 * To consider truncating the relation, we want there to be at least
 * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
 * is less) potentially-freeable pages.
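 *
 * For example, an 8000-page relation qualifies with Min(1000, 8000/16) =
 * 500 potentially-freeable pages; the 1000-page minimum becomes the
 * binding limit only once the relation exceeds 16000 pages.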
 */
#define REL_TRUNCATE_MINIMUM	1000
#define REL_TRUNCATE_FRACTION	16


typedef struct LVRelStats
{
	/* hasindex = true means two-pass strategy; false means one-pass */
	bool		hasindex;
	/* Overall statistics about rel */
	BlockNumber rel_pages;
	double		rel_tuples;
	BlockNumber pages_removed;
	double		tuples_deleted;
	BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
	Size		threshold;		/* minimum interesting free space */
	/* List of TIDs of tuples we intend to delete */
	/* NB: this list is ordered by TID address */
	int			num_dead_tuples;	/* current # of entries */
	int			max_dead_tuples;	/* # slots allocated in array */
	ItemPointer dead_tuples;	/* array of ItemPointerData */
	/* Array or heap of per-page info about free space */
	/* We use a simple array until it fills up, then convert to heap */
	bool		fs_is_heap;		/* are we using heap organization? */
	int			num_free_pages; /* current # of entries */
	int			max_free_pages; /* # slots allocated in array */
	PageFreeSpaceInfo *free_pages;		/* array or heap of blkno/avail */
	BlockNumber tot_free_pages; /* total pages with >= threshold space */
	int			num_index_scans;
} LVRelStats;


/* A few variables that don't seem worth passing around as parameters */
static int	elevel = -1;

static TransactionId OldestXmin;
static TransactionId FreezeLimit;

static BufferAccessStrategy vac_strategy;


/* non-export function prototypes */
static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
			   Relation *Irel, int nindexes);
static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
static void lazy_vacuum_index(Relation indrel,
				  IndexBulkDeleteResult **stats,
				  LVRelStats *vacrelstats);
static void lazy_cleanup_index(Relation indrel,
				   IndexBulkDeleteResult *stats,
				   LVRelStats *vacrelstats);
static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
				 int tupindex, LVRelStats *vacrelstats);
static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
static BlockNumber count_nondeletable_pages(Relation onerel,
						 LVRelStats *vacrelstats);
static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks);
static void lazy_record_dead_tuple(LVRelStats *vacrelstats,
					   ItemPointer itemptr);
static void lazy_record_free_space(LVRelStats *vacrelstats,
					   BlockNumber page, Size avail);
static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
static void lazy_update_fsm(Relation onerel, LVRelStats *vacrelstats);
static int	vac_cmp_itemptr(const void *left, const void *right);
static int	vac_cmp_page_spaces(const void *left, const void *right);


/*
 *	lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indexes, and
 *		updates its relpages and reltuples statistics.
 *
 *		At entry, we have already established a transaction and opened
 *		and locked the relation.
 */
void
lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
				BufferAccessStrategy bstrategy)
{
	LVRelStats *vacrelstats;
	Relation   *Irel;
	int			nindexes;
	BlockNumber possibly_freeable;
	PGRUsage	ru0;
	TimestampTz	starttime = 0;

	pg_rusage_init(&ru0);

	/* measure elapsed time iff autovacuum logging requires it */
	if (IsAutoVacuumWorkerProcess() && Log_autovacuum > 0)
		starttime = GetCurrentTimestamp();

	if (vacstmt->verbose)
		elevel = INFO;
	else
		elevel = DEBUG2;

	vac_strategy = bstrategy;

	vacuum_set_xid_limits(vacstmt->freeze_min_age, onerel->rd_rel->relisshared,
						  &OldestXmin, &FreezeLimit);

	vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));

	/* Set threshold for interesting free space = average request size */
	/* XXX should we scale it up or down?  Adjust vacuum.c too, if so */
	vacrelstats->threshold = GetAvgFSMRequestSize(&onerel->rd_node);

	vacrelstats->num_index_scans = 0;

	/* Open all indexes of the relation */
	vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
	vacrelstats->hasindex = (nindexes > 0);

	/* Do the vacuuming */
	lazy_scan_heap(onerel, vacrelstats, Irel, nindexes);

	/* Done with indexes */
	vac_close_indexes(nindexes, Irel, NoLock);

	/*
	 * Optionally truncate the relation.
	 *
	 * Don't even think about it unless we have a shot at releasing a goodly
	 * number of pages.  Otherwise, the time taken isn't worth it.
	 */
	possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
	if (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
		possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION)
		lazy_truncate_heap(onerel, vacrelstats);

	/* Update shared free space map with final free space info */
	lazy_update_fsm(onerel, vacrelstats);

	if (vacrelstats->tot_free_pages > MaxFSMPages)
		ereport(WARNING,
				(errmsg("relation \"%s.%s\" contains more than \"max_fsm_pages\" pages with useful free space",
						get_namespace_name(RelationGetNamespace(onerel)),
						RelationGetRelationName(onerel)),
				 errhint((vacrelstats->tot_free_pages > vacrelstats->rel_pages * 0.20 ?
							/* Only suggest VACUUM FULL if 20% free */
							"Consider using VACUUM FULL on this relation or increasing the configuration parameter \"max_fsm_pages\"." :
							"Consider increasing the configuration parameter \"max_fsm_pages\"."))));

	/* Update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(onerel),
						vacrelstats->rel_pages,
						vacrelstats->rel_tuples,
						vacrelstats->hasindex,
						FreezeLimit);

	/* report results to the stats collector, too */
	pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared,
						 vacstmt->analyze, vacrelstats->rel_tuples);

	/* and log the action if appropriate */
	if (IsAutoVacuumWorkerProcess() && Log_autovacuum >= 0)
	{
		if (Log_autovacuum == 0 ||
			TimestampDifferenceExceeds(starttime, GetCurrentTimestamp(),
									   Log_autovacuum))
			ereport(LOG,
					(errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"
							"pages: %u removed, %u remain\n"
							"tuples: %.0f removed, %.0f remain\n"
							"system usage: %s",
							get_database_name(MyDatabaseId),
							get_namespace_name(RelationGetNamespace(onerel)),
							RelationGetRelationName(onerel),
							vacrelstats->num_index_scans,
							vacrelstats->pages_removed, vacrelstats->rel_pages,
							vacrelstats->tuples_deleted, vacrelstats->rel_tuples,
							pg_rusage_show(&ru0))));
	}
}


/*
 *	lazy_scan_heap() -- scan an open heap relation
 *
 *		This routine sets commit status bits, builds lists of dead tuples
 *		and pages with free space, and calculates statistics on the number
 *		of live tuples in the heap.  When done, or when we run low on space
 *		for dead-tuple TIDs, invoke vacuuming of indexes and heap.
 *
 *		If there are no indexes then we just vacuum each dirty page as we
 *		process it, since there's no point in gathering many tuples.
 */
static void
lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
			   Relation *Irel, int nindexes)
{
	BlockNumber nblocks,
				blkno;
	HeapTupleData tuple;
	char	   *relname;
	BlockNumber empty_pages,
				vacuumed_pages;
	double		num_tuples,
				tups_vacuumed,
				nkeep,
				nunused;
	IndexBulkDeleteResult **indstats;
	int			i;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	relname = RelationGetRelationName(onerel);
	ereport(elevel,
			(errmsg("vacuuming \"%s.%s\"",
					get_namespace_name(RelationGetNamespace(onerel)),
					relname)));

	empty_pages = vacuumed_pages = 0;
	num_tuples = tups_vacuumed = nkeep = nunused = 0;

	indstats = (IndexBulkDeleteResult **)
		palloc0(nindexes * sizeof(IndexBulkDeleteResult *));

	nblocks = RelationGetNumberOfBlocks(onerel);
	vacrelstats->rel_pages = nblocks;
	vacrelstats->nonempty_pages = 0;

	lazy_space_alloc(vacrelstats, nblocks);

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		bool		tupgone,
					hastup;
		int			prev_dead_count;
		OffsetNumber frozen[MaxOffsetNumber];
		int			nfrozen;

		vacuum_delay_point();

		/*
		 * If we are close to overrunning the available space for dead-tuple
		 * TIDs, pause and do a cycle of vacuuming before we tackle this page.
		 */
		if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
			vacrelstats->num_dead_tuples > 0)
		{
			/* Remove index entries */
			for (i = 0; i < nindexes; i++)
				lazy_vacuum_index(Irel[i],
								  &indstats[i],
								  vacrelstats);
			/* Remove tuples from heap */
			lazy_vacuum_heap(onerel, vacrelstats);
			/* Forget the now-vacuumed tuples, and press on */
			vacrelstats->num_dead_tuples = 0;
			vacrelstats->num_index_scans++;
		}

		buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy);

		/* Initially, we only need shared access to the buffer */
		LockBuffer(buf, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buf);

		if (PageIsNew(page))
		{
			/*
			 * An all-zeroes page could be left over if a backend extends the
			 * relation but crashes before initializing the page. Reclaim such
			 * pages for use.
			 *
			 * We have to be careful here because we could be looking at a
			 * page that someone has just added to the relation and not yet
			 * been able to initialize (see RelationGetBufferForTuple). To
			 * interlock against that, release the buffer read lock (which we
			 * must do anyway) and grab the relation extension lock before
			 * re-locking in exclusive mode.  If the page is still
			 * uninitialized by then, it must be left over from a crashed
			 * backend, and we can initialize it.
			 *
			 * We don't really need the relation lock when this is a new or
			 * temp relation, but it's probably not worth the code space to
			 * check that, since this surely isn't a critical path.
			 *
			 * Note: the comparable code in vacuum.c need not worry because
			 * it's got exclusive lock on the whole relation.
			 */
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			LockRelationForExtension(onerel, ExclusiveLock);
			UnlockRelationForExtension(onerel, ExclusiveLock);
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
			if (PageIsNew(page))
			{
				ereport(WARNING,
				(errmsg("relation \"%s\" page %u is uninitialized --- fixing",
						relname, blkno)));
				PageInit(page, BufferGetPageSize(buf), 0);
				empty_pages++;
				lazy_record_free_space(vacrelstats, blkno,
									   PageGetFreeSpace(page));
			}
			MarkBufferDirty(buf);
			UnlockReleaseBuffer(buf);
			continue;
		}

		if (PageIsEmpty(page))
		{
			empty_pages++;
			lazy_record_free_space(vacrelstats, blkno,
								   PageGetFreeSpace(page));
			UnlockReleaseBuffer(buf);
			continue;
		}

		nfrozen = 0;
		hastup = false;
		prev_dead_count = vacrelstats->num_dead_tuples;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid;

			itemid = PageGetItemId(page, offnum);

			if (!ItemIdIsUsed(itemid))
			{
				nunused += 1;
				continue;
			}

			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);

			tupgone = false;

			switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
			{
				case HEAPTUPLE_DEAD:
					tupgone = true;		/* we can delete the tuple */
					break;
				case HEAPTUPLE_LIVE:
					/* Tuple is good --- but let's do some validity checks */
					if (onerel->rd_rel->relhasoids &&
						!OidIsValid(HeapTupleGetOid(&tuple)))
						elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
							 relname, blkno, offnum);
					break;
				case HEAPTUPLE_RECENTLY_DEAD:

					/*
					 * If tuple is recently deleted then we must not remove it
					 * from relation.
					 */
					nkeep += 1;
					break;
				case HEAPTUPLE_INSERT_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					break;
				case HEAPTUPLE_DELETE_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					break;
				default:
					elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
					break;
			}

			if (tupgone)
			{
				lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
				tups_vacuumed += 1;
			}
			else
			{
				num_tuples += 1;
				hastup = true;

				/*
				 * Each non-removable tuple must be checked to see if it
				 * needs freezing.  If we already froze anything, then
				 * we've already switched the buffer lock to exclusive.
				 */
				if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
									  (nfrozen > 0) ? InvalidBuffer : buf))
					frozen[nfrozen++] = offnum;
			}
		}						/* scan along page */

		/*
		 * If we froze any tuples, mark the buffer dirty, and write a WAL
		 * record recording the changes.  We must log the changes to be
		 * crash-safe against future truncation of CLOG.
		 */
		if (nfrozen > 0)
		{
			MarkBufferDirty(buf);
			/* no XLOG for temp tables, though */
			if (!onerel->rd_istemp)
			{
				XLogRecPtr	recptr;

				recptr = log_heap_freeze(onerel, buf, FreezeLimit,
										 frozen, nfrozen);
				PageSetLSN(page, recptr);
				PageSetTLI(page, ThisTimeLineID);
			}
		}

		/*
		 * If there are no indexes then we can vacuum the page right now
		 * instead of doing a second scan.
		 */
		if (nindexes == 0 &&
			vacrelstats->num_dead_tuples > 0)
		{
			/* Trade in buffer share lock for super-exclusive lock */
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			LockBufferForCleanup(buf);
			/* Remove tuples from heap */
			lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats);
			/* Forget the now-vacuumed tuples, and press on */
			vacrelstats->num_dead_tuples = 0;
			vacuumed_pages++;
		}

		/*
		 * If we remembered any tuples for deletion, then the page will be
		 * visited again by lazy_vacuum_heap, which will compute and record
		 * its post-compaction free space.	If not, then we're done with this
		 * page, so remember its free space as-is.	(This path will always be
		 * taken if there are no indexes.)
		 */
		if (vacrelstats->num_dead_tuples == prev_dead_count)
		{
			lazy_record_free_space(vacrelstats, blkno,
								   PageGetFreeSpace(page));
		}

		/* Remember the location of the last page with nonremovable tuples */
		if (hastup)
			vacrelstats->nonempty_pages = blkno + 1;

		UnlockReleaseBuffer(buf);
	}

	/* save stats for use later */
	vacrelstats->rel_tuples = num_tuples;
	vacrelstats->tuples_deleted = tups_vacuumed;

	/* If any tuples need to be deleted, perform final vacuum cycle */
	/* XXX put a threshold on min number of tuples here? */
	if (vacrelstats->num_dead_tuples > 0)
	{
		/* Remove index entries */
		for (i = 0; i < nindexes; i++)
			lazy_vacuum_index(Irel[i],
							  &indstats[i],
							  vacrelstats);
		/* Remove tuples from heap */
		lazy_vacuum_heap(onerel, vacrelstats);
		vacrelstats->num_index_scans++;
	}

	/* Do post-vacuum cleanup and statistics update for each index */
	for (i = 0; i < nindexes; i++)
		lazy_cleanup_index(Irel[i], indstats[i], vacrelstats);

	/* If no indexes, make log report that lazy_vacuum_heap would've made */
	if (vacuumed_pages)
		ereport(elevel,
				(errmsg("\"%s\": removed %.0f row versions in %u pages",
						RelationGetRelationName(onerel),
						tups_vacuumed, vacuumed_pages)));

	ereport(elevel,
			(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
					RelationGetRelationName(onerel),
					tups_vacuumed, num_tuples, nblocks),
			 errdetail("%.0f dead row versions cannot be removed yet.\n"
					   "There were %.0f unused item pointers.\n"
					   "%u pages contain useful free space.\n"
					   "%u pages are entirely empty.\n"
					   "%s.",
					   nkeep,
					   nunused,
					   vacrelstats->tot_free_pages,
					   empty_pages,
					   pg_rusage_show(&ru0))));
}


/*
 *	lazy_vacuum_heap() -- second pass over the heap
 *
 *		This routine marks dead tuples as unused and compacts out free
 *		space on their pages.  Pages not having dead tuples recorded from
 *		lazy_scan_heap are not visited at all.
 *
 * Note: the reason for doing this as a second pass is we cannot remove
 * the tuples until we've removed their index entries, and we want to
 * process index entry removal in batches as large as possible.
 */
static void
lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
{
	int			tupindex;
	int			npages;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);
	npages = 0;

	tupindex = 0;
	while (tupindex < vacrelstats->num_dead_tuples)
	{
		BlockNumber tblk;
		Buffer		buf;
		Page		page;

		vacuum_delay_point();

		tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
		buf = ReadBufferWithStrategy(onerel, tblk, vac_strategy);
		LockBufferForCleanup(buf);
		tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats);
		/* Now that we've compacted the page, record its available space */
		page = BufferGetPage(buf);
		lazy_record_free_space(vacrelstats, tblk,
							   PageGetFreeSpace(page));
		UnlockReleaseBuffer(buf);
		npages++;
	}

	ereport(elevel,
			(errmsg("\"%s\": removed %d row versions in %d pages",
					RelationGetRelationName(onerel),
					tupindex, npages),
			 errdetail("%s.",
					   pg_rusage_show(&ru0))));
}

/*
 *	lazy_vacuum_page() -- free dead tuples on a page
 *					 and repair its fragmentation.
 *
 * Caller must hold pin and lock on the buffer.
 *
 * tupindex is the index in vacrelstats->dead_tuples of the first dead
 * tuple for this page.  We assume the rest follow sequentially.
 * The return value is the first tupindex after the tuples of this page.
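 *
 * For example (hypothetical contents): if dead_tuples holds
 * {(3,1), (3,7), (4,2)} and we are called for block 3 with tupindex 0,
 * the first two items are marked unused and the return value is 2.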
 */
static int
lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
				 int tupindex, LVRelStats *vacrelstats)
{
	OffsetNumber unused[MaxOffsetNumber];
	int			uncnt;
	Page		page = BufferGetPage(buffer);
	ItemId		itemid;

	START_CRIT_SECTION();

	for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
	{
		BlockNumber tblk;
		OffsetNumber toff;

		tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
		if (tblk != blkno)
			break;				/* past end of tuples for this block */
		toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
		itemid = PageGetItemId(page, toff);
		itemid->lp_flags &= ~LP_USED;
	}

	uncnt = PageRepairFragmentation(page, unused);

	MarkBufferDirty(buffer);

	/* XLOG stuff */
	if (!onerel->rd_istemp)
	{
		XLogRecPtr	recptr;

		recptr = log_heap_clean(onerel, buffer, unused, uncnt);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
	}

	END_CRIT_SECTION();

	return tupindex;
}

/*
 *	lazy_vacuum_index() -- vacuum one index relation.
 *
 *		Delete all the index entries pointing to tuples listed in
 *		vacrelstats->dead_tuples, and update running statistics.
 */
static void
lazy_vacuum_index(Relation indrel,
				  IndexBulkDeleteResult **stats,
				  LVRelStats *vacrelstats)
{
	IndexVacuumInfo ivinfo;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	ivinfo.index = indrel;
	ivinfo.vacuum_full = false;
	ivinfo.message_level = elevel;
	/* We don't yet know rel_tuples, so pass -1 */
	ivinfo.num_heap_tuples = -1;
	ivinfo.strategy = vac_strategy;

	/* Do bulk deletion */
	*stats = index_bulk_delete(&ivinfo, *stats,
							   lazy_tid_reaped, (void *) vacrelstats);

	ereport(elevel,
			(errmsg("scanned index \"%s\" to remove %d row versions",
					RelationGetRelationName(indrel),
					vacrelstats->num_dead_tuples),
			 errdetail("%s.", pg_rusage_show(&ru0))));
}

/*
 *	lazy_cleanup_index() -- do post-vacuum cleanup for one index relation.
 */
static void
lazy_cleanup_index(Relation indrel,
				   IndexBulkDeleteResult *stats,
				   LVRelStats *vacrelstats)
{
	IndexVacuumInfo ivinfo;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	ivinfo.index = indrel;
	ivinfo.vacuum_full = false;
	ivinfo.message_level = elevel;
	ivinfo.num_heap_tuples = vacrelstats->rel_tuples;
	ivinfo.strategy = vac_strategy;

	stats = index_vacuum_cleanup(&ivinfo, stats);

	if (!stats)
		return;

	/* now update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(indrel),
						stats->num_pages,
						stats->num_index_tuples,
						false, InvalidTransactionId);

	ereport(elevel,
			(errmsg("index \"%s\" now contains %.0f row versions in %u pages",
					RelationGetRelationName(indrel),
					stats->num_index_tuples,
					stats->num_pages),
			 errdetail("%.0f index row versions were removed.\n"
			 "%u index pages have been deleted, %u are currently reusable.\n"
					   "%s.",
					   stats->tuples_removed,
					   stats->pages_deleted, stats->pages_free,
					   pg_rusage_show(&ru0))));

	pfree(stats);
}

/*
 * lazy_truncate_heap - try to truncate off any empty pages at the end
 */
static void
lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
{
	BlockNumber old_rel_pages = vacrelstats->rel_pages;
	BlockNumber new_rel_pages;
	PageFreeSpaceInfo *pageSpaces;
	int			n;
	int			i,
				j;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	/*
	 * We need full exclusive lock on the relation in order to do truncation.
	 * If we can't get it, give up rather than waiting --- we don't want to
	 * block other backends, and we don't want to deadlock (which is quite
	 * possible considering we already hold a lower-grade lock).
	 */
	if (!ConditionalLockRelation(onerel, AccessExclusiveLock))
		return;

	/*
	 * Now that we have exclusive lock, look to see if the rel has grown
	 * whilst we were vacuuming with non-exclusive lock.  If so, give up; the
	 * newly added pages presumably contain non-deletable tuples.
	 */
	new_rel_pages = RelationGetNumberOfBlocks(onerel);
	if (new_rel_pages != old_rel_pages)
	{
		/* might as well use the latest news when we update pg_class stats */
		vacrelstats->rel_pages = new_rel_pages;
		UnlockRelation(onerel, AccessExclusiveLock);
		return;
	}

	/*
	 * Scan backwards from the end to verify that the end pages actually
	 * contain nothing we need to keep.  This is *necessary*, not optional,
	 * because other backends could have added tuples to these pages whilst we
	 * were vacuuming.
	 */
	new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);

	if (new_rel_pages >= old_rel_pages)
	{
		/* can't do anything after all */
		UnlockRelation(onerel, AccessExclusiveLock);
		return;
	}

	/*
	 * Okay to truncate.
	 */
	RelationTruncate(onerel, new_rel_pages);

	/*
	 * Drop free-space info for removed blocks; these must not get entered
	 * into the FSM!
	 */
	pageSpaces = vacrelstats->free_pages;
	n = vacrelstats->num_free_pages;
	j = 0;
	for (i = 0; i < n; i++)
	{
		if (pageSpaces[i].blkno < new_rel_pages)
		{
			pageSpaces[j] = pageSpaces[i];
			j++;
		}
	}
	vacrelstats->num_free_pages = j;

	/*
	 * If tot_free_pages was more than num_free_pages, we can't tell for sure
	 * what its correct value is now, because we don't know which of the
	 * forgotten pages are getting truncated.  Conservatively set it equal to
	 * num_free_pages.
	 */
	vacrelstats->tot_free_pages = j;

	/* We destroyed the heap ordering, so mark array unordered */
	vacrelstats->fs_is_heap = false;

	/* update statistics */
	vacrelstats->rel_pages = new_rel_pages;
	vacrelstats->pages_removed = old_rel_pages - new_rel_pages;

	/*
	 * We keep the exclusive lock until commit (perhaps not necessary)?
	 */

	ereport(elevel,
			(errmsg("\"%s\": truncated %u to %u pages",
					RelationGetRelationName(onerel),
					old_rel_pages, new_rel_pages),
			 errdetail("%s.",
					   pg_rusage_show(&ru0))));
}

/*
 * Rescan end pages to verify that they are (still) empty of needed tuples.
 *
 * Returns number of nondeletable pages (last nonempty page + 1).
 */
static BlockNumber
count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
{
	BlockNumber blkno;
	HeapTupleData tuple;

	/* Strange coding of loop control is needed because blkno is unsigned */
	blkno = vacrelstats->rel_pages;
	while (blkno > vacrelstats->nonempty_pages)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		bool		tupgone,
					hastup;

		/*
		 * We don't insert a vacuum delay point here, because we have an
		 * exclusive lock on the table which we want to hold for as short
		 * a time as possible.
		 */

		blkno--;

		buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy);

		/* In this phase we only need shared access to the buffer */
		LockBuffer(buf, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buf);

		if (PageIsNew(page) || PageIsEmpty(page))
		{
			/* PageIsNew probably shouldn't happen... */
			UnlockReleaseBuffer(buf);
			continue;
		}

		hastup = false;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid;

			itemid = PageGetItemId(page, offnum);

			if (!ItemIdIsUsed(itemid))
				continue;

			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);

			tupgone = false;

			switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
			{
				case HEAPTUPLE_DEAD:
					tupgone = true;		/* we can delete the tuple */
					break;
				case HEAPTUPLE_LIVE:
					/* Shouldn't be necessary to re-freeze anything */
					break;
				case HEAPTUPLE_RECENTLY_DEAD:

					/*
					 * If tuple is recently deleted then we must not remove it
					 * from relation.
					 */
					break;
				case HEAPTUPLE_INSERT_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					break;
				case HEAPTUPLE_DELETE_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					break;
				default:
					elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
					break;
			}

			if (!tupgone)
			{
				hastup = true;
				break;			/* can stop scanning */
			}
		}						/* scan along page */

		UnlockReleaseBuffer(buf);

		/* Done scanning if we found a tuple here */
		if (hastup)
			return blkno + 1;
	}

	/*
	 * If we fall out of the loop, all the previously-thought-to-be-empty
	 * pages really are; we need not bother to look at the last known-nonempty
	 * page.
	 */
	return vacrelstats->nonempty_pages;
}

/*
 * lazy_space_alloc - space allocation decisions for lazy vacuum
 *
 * See the comments at the head of this file for rationale.
 */
static void
lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
{
	long		maxtuples;
	int			maxpages;

	if (vacrelstats->hasindex)
	{
		maxtuples = (maintenance_work_mem * 1024L) / sizeof(ItemPointerData);
		maxtuples = Min(maxtuples, INT_MAX);
		maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData));
		/* stay sane if small maintenance_work_mem */
		maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
	}
	else
	{
		maxtuples = MaxHeapTuplesPerPage;
	}

	vacrelstats->num_dead_tuples = 0;
	vacrelstats->max_dead_tuples = (int) maxtuples;
	vacrelstats->dead_tuples = (ItemPointer)
		palloc(maxtuples * sizeof(ItemPointerData));

	maxpages = MaxFSMPages;
	maxpages = Min(maxpages, MaxAllocSize / sizeof(PageFreeSpaceInfo));
	/* No need to allocate more pages than the relation has blocks */
	if (relblocks < (BlockNumber) maxpages)
		maxpages = (int) relblocks;

	vacrelstats->fs_is_heap = false;
	vacrelstats->num_free_pages = 0;
	vacrelstats->max_free_pages = maxpages;
	vacrelstats->free_pages = (PageFreeSpaceInfo *)
		palloc(maxpages * sizeof(PageFreeSpaceInfo));
	vacrelstats->tot_free_pages = 0;
}

/*
 * lazy_record_dead_tuple - remember one deletable tuple
 */
static void
lazy_record_dead_tuple(LVRelStats *vacrelstats,
					   ItemPointer itemptr)
{
	/*
	 * The array shouldn't overflow under normal behavior, but perhaps it
	 * could if we are given a really small maintenance_work_mem. In that
	 * case, just forget the last few tuples.
	 */
	if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
	{
		vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
		vacrelstats->num_dead_tuples++;
	}
}

/*
 * lazy_record_free_space - remember free space on one page
 */
static void
lazy_record_free_space(LVRelStats *vacrelstats,
					   BlockNumber page,
					   Size avail)
{
	PageFreeSpaceInfo *pageSpaces;
	int			n;

	/*
	 * A page with less than stats->threshold free space will be forgotten
	 * immediately, and never passed to the free space map.  Removing the
	 * uselessly small entries early saves cycles, and in particular reduces
	 * the amount of time we spend holding the FSM lock when we finally call
	 * RecordRelationFreeSpace.  Since the FSM will probably drop pages with
	 * little free space anyway, there's no point in making this really small.
	 *
	 * XXX Is it worth trying to measure average tuple size, and using that to
	 * adjust the threshold?  Would be worthwhile if FSM has no stats yet for
	 * this relation.  But changing the threshold as we scan the rel might
	 * lead to bizarre behavior, too.  Also, it's probably better if vacuum.c
	 * has the same thresholding behavior as we do here.
	 */
	if (avail < vacrelstats->threshold)
		return;

	/* Count all pages over threshold, even if not enough space in array */
	vacrelstats->tot_free_pages++;

	/* Copy pointers to local variables for notational simplicity */
	pageSpaces = vacrelstats->free_pages;
	n = vacrelstats->max_free_pages;

	/* If we haven't filled the array yet, just keep adding entries */
	if (vacrelstats->num_free_pages < n)
	{
		pageSpaces[vacrelstats->num_free_pages].blkno = page;
		pageSpaces[vacrelstats->num_free_pages].avail = avail;
		vacrelstats->num_free_pages++;
		return;
	}

	/*----------
	 * The rest of this routine works with "heap" organization of the
	 * free space arrays, wherein we maintain the heap property
	 *			avail[(j-1) div 2] <= avail[j]	for 0 < j < n.
	 * In particular, the zero'th element always has the smallest available
	 * space and can be discarded to make room for a new page with more space.
	 * See Knuth's discussion of heap-based priority queues, sec 5.2.3;
	 * but note he uses 1-origin array subscripts, not 0-origin.
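	 *
	 * For instance (a hypothetical state), avail = {2, 5, 3, 9, 7, 4}
	 * satisfies the property: each entry is >= its parent at (j-1) div 2,
	 * and element 0 holds the minimum, 2.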
	 *----------
	 */

	/* If we haven't yet converted the array to heap organization, do it */
	if (!vacrelstats->fs_is_heap)
	{
		/*
		 * Scan backwards through the array, "sift-up" each value into its
		 * correct position.  We can start the scan at n/2-1 since each entry
		 * above that position has no children to worry about.
		 */
		int			l = n / 2;

		while (--l >= 0)
		{
			BlockNumber R = pageSpaces[l].blkno;
			Size		K = pageSpaces[l].avail;
			int			i;		/* i is where the "hole" is */

			i = l;
			for (;;)
			{
				int			j = 2 * i + 1;

				if (j >= n)
					break;
				if (j + 1 < n && pageSpaces[j].avail > pageSpaces[j + 1].avail)
					j++;
				if (K <= pageSpaces[j].avail)
					break;
				pageSpaces[i] = pageSpaces[j];
				i = j;
			}
			pageSpaces[i].blkno = R;
			pageSpaces[i].avail = K;
		}

		vacrelstats->fs_is_heap = true;
	}

	/* If new page has more than zero'th entry, insert it into heap */
	if (avail > pageSpaces[0].avail)
	{
		/*
		 * Notionally, we replace the zero'th entry with the new data, and
		 * then sift-up to maintain the heap property.	Physically, the new
		 * data doesn't get stored into the arrays until we find the right
		 * location for it.
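		 *
		 * For example (hypothetical values): with avail = {2, 5, 3} and a
		 * new page whose avail is 4, the root (2) is evicted, the smaller
		 * child (3) moves up, and the new entry lands in its old slot,
		 * leaving {3, 5, 4}.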
		 */
		int			i = 0;		/* i is where the "hole" is */

		for (;;)
		{
			int			j = 2 * i + 1;

			if (j >= n)
				break;
			if (j + 1 < n && pageSpaces[j].avail > pageSpaces[j + 1].avail)
				j++;
			if (avail <= pageSpaces[j].avail)
				break;
			pageSpaces[i] = pageSpaces[j];
			i = j;
		}
		pageSpaces[i].blkno = page;
		pageSpaces[i].avail = avail;
	}
}

/*
 *	lazy_tid_reaped() -- is a particular tid deletable?
 *
 *		This has the right signature to be an IndexBulkDeleteCallback.
 *
 *		Assumes dead_tuples array is in sorted order.
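 *		(The ordering is vac_cmp_itemptr's: block number first, then
 *		offset; e.g. (5,2) < (5,10) < (6,1).)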
 */
static bool
lazy_tid_reaped(ItemPointer itemptr, void *state)
{
	LVRelStats *vacrelstats = (LVRelStats *) state;
	ItemPointer res;

	res = (ItemPointer) bsearch((void *) itemptr,
								(void *) vacrelstats->dead_tuples,
								vacrelstats->num_dead_tuples,
								sizeof(ItemPointerData),
								vac_cmp_itemptr);

	return (res != NULL);
}

/*
 * Update the shared Free Space Map with the info we now have about
 * free space in the relation, discarding any old info the map may have.
 */
static void
lazy_update_fsm(Relation onerel, LVRelStats *vacrelstats)
{
	PageFreeSpaceInfo *pageSpaces = vacrelstats->free_pages;
	int			nPages = vacrelstats->num_free_pages;

	/*
	 * Sort data into order, as required by RecordRelationFreeSpace.
	 */
	if (nPages > 1)
		qsort(pageSpaces, nPages, sizeof(PageFreeSpaceInfo),
			  vac_cmp_page_spaces);

	RecordRelationFreeSpace(&onerel->rd_node, vacrelstats->tot_free_pages,
							nPages, pageSpaces);
}

/*
 * Comparator routines for use with qsort() and bsearch().
 */
static int
vac_cmp_itemptr(const void *left, const void *right)
{
	BlockNumber lblk,
				rblk;
	OffsetNumber loff,
				roff;

	lblk = ItemPointerGetBlockNumber((ItemPointer) left);
	rblk = ItemPointerGetBlockNumber((ItemPointer) right);

	if (lblk < rblk)
		return -1;
	if (lblk > rblk)
		return 1;

	loff = ItemPointerGetOffsetNumber((ItemPointer) left);
	roff = ItemPointerGetOffsetNumber((ItemPointer) right);

	if (loff < roff)
		return -1;
	if (loff > roff)
		return 1;

	return 0;
}

static int
vac_cmp_page_spaces(const void *left, const void *right)
{
	PageFreeSpaceInfo *linfo = (PageFreeSpaceInfo *) left;
	PageFreeSpaceInfo *rinfo = (PageFreeSpaceInfo *) right;

	if (linfo->blkno < rinfo->blkno)
		return -1;
	else if (linfo->blkno > rinfo->blkno)
		return 1;
	return 0;
}