/*-------------------------------------------------------------------------
 *
 * vacuum.c
 *	  the postgres vacuum cleaner
 *
 * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.174 2000/11/30 08:46:22 vadim Exp $
 *
 *
 *-------------------------------------------------------------------------
 */
#include <sys/types.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

#include "postgres.h"

#include "access/genam.h"
#include "access/heapam.h"
#include "catalog/catalog.h"
#include "catalog/catname.h"
#include "catalog/index.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "nodes/execnodes.h"
#include "storage/sinval.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/relcache.h"
#include "utils/syscache.h"
#include "utils/temprel.h"

#ifndef HAVE_GETRUSAGE
#include "rusagestub.h"
#else
#include <sys/time.h>
#include <sys/resource.h>
#endif

#include "access/xlog.h"
extern XLogRecPtr	log_heap_move(Relation reln,
						ItemPointerData from, HeapTuple newtup);

static MemoryContext vac_context = NULL;

static int	MESSAGE_LEVEL;		/* message level */

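/*
 * Cutoff XID obtained via GetXmaxRecent() in vacuum_rel(): tuples deleted by
 * transactions at or after this cutoff are kept, since concurrent
 * transactions may still need to see them.
 */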
static TransactionId XmaxRecent;

/* non-export function prototypes */
static void vacuum_init(void);
static void vacuum_shutdown(void);
static void vac_vacuum(NameData *VacRelP, bool analyze, List *anal_cols2);
static VRelList getrels(NameData *VacRelP);
static void vacuum_rel(Oid relid, bool analyze, bool is_toastrel);
static void scan_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages, VacPageList fraged_pages);
static void repair_frag(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages, VacPageList fraged_pages, int nindices, Relation *Irel);
static void vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacpagelist);
static void vacuum_page(Page page, VacPage vacpage);
static void vacuum_index(VacPageList vacpagelist, Relation indrel, int num_tuples, int keep_tuples);
static void scan_index(Relation indrel, int num_tuples);
static void update_relstats(Oid relid, int num_pages, int num_tuples, bool hasindex, VRelStats *vacrelstats);
static VacPage tid_reaped(ItemPointer itemptr, VacPageList vacpagelist);
static void reap_page(VacPageList vacpagelist, VacPage vacpage);
static void vpage_insert(VacPageList vacpagelist, VacPage vpnew);
static void get_indices(Relation relation, int *nindices, Relation **Irel);
static void close_indices(int nindices, Relation *Irel);
static IndexInfo **get_index_desc(Relation onerel, int nindices,
								  Relation *Irel);
static void *vac_find_eq(void *bot, int nelem, int size, void *elm,
			 int (*compar) (const void *, const void *));
static int	vac_cmp_blk(const void *left, const void *right);
static int	vac_cmp_offno(const void *left, const void *right);
static int	vac_cmp_vtlinks(const void *left, const void *right);
static bool enough_space(VacPage vacpage, Size len);
static char *show_rusage(struct rusage * ru0);


void
vacuum(char *vacrel, bool verbose, bool analyze, List *anal_cols)
{
	NameData	VacRel;
	Name		VacRelName;
	MemoryContext old;
	List	   *le;
	List	   *anal_cols2 = NIL;

	if (anal_cols != NIL && !analyze)
		elog(ERROR, "Can't vacuum columns, only tables.  You can 'vacuum analyze' columns.");

	/*
	 * We cannot run VACUUM inside a user transaction block; if we were
	 * inside a transaction, then our commit- and
	 * start-transaction-command calls would not have the intended effect!
	 * Furthermore, the forced commit that occurs before truncating the
	 * relation's file would have the effect of committing the rest of the
	 * user's transaction too, which would certainly not be the desired
	 * behavior.
	 */
	if (IsTransactionBlock())
		elog(ERROR, "VACUUM cannot run inside a BEGIN/END block");

	if (verbose)
		MESSAGE_LEVEL = NOTICE;
	else
		MESSAGE_LEVEL = DEBUG;

	/*
	 * Create special memory context for cross-transaction storage.
	 *
	 * Since it is a child of QueryContext, it will go away eventually
	 * even if we suffer an error; there's no need for special abort
	 * cleanup logic.
	 */
	vac_context = AllocSetContextCreate(QueryContext,
										"Vacuum",
										ALLOCSET_DEFAULT_MINSIZE,
										ALLOCSET_DEFAULT_INITSIZE,
										ALLOCSET_DEFAULT_MAXSIZE);

	/* vacrel gets de-allocated on xact commit, so copy it to safe storage */
	if (vacrel)
	{
		namestrcpy(&VacRel, vacrel);
		VacRelName = &VacRel;
	}
	else
		VacRelName = NULL;

	/* must also copy the column list, if any, to safe storage */
	old = MemoryContextSwitchTo(vac_context);
	foreach(le, anal_cols)
	{
		char	   *col = (char *) lfirst(le);

		anal_cols2 = lappend(anal_cols2, pstrdup(col));
	}
	MemoryContextSwitchTo(old);

	/*
	 * Start up the vacuum cleaner.
	 *
	 * NOTE: since this commits the current transaction, the memory holding
	 * any passed-in parameters gets freed here.  We must have already
	 * copied pass-by-reference parameters to safe storage.  Don't make me
	 * fix this again!
	 */
	vacuum_init();

	/* vacuum the database */
	vac_vacuum(VacRelName, analyze, anal_cols2);

	/* clean up */
	vacuum_shutdown();
}

/*
 *	vacuum_init(), vacuum_shutdown() -- start up and shut down the vacuum cleaner.
 *
 *		Formerly, there was code here to prevent more than one VACUUM from
 *		executing concurrently in the same database.  However, there's no
 *		good reason to prevent that, and manually removing lockfiles after
 *		a vacuum crash was a pain for dbadmins.  So, forget about lockfiles,
 *		and just rely on the exclusive lock we grab on each target table
 *		to ensure that there aren't two VACUUMs running on the same table
 *		at the same time.
 *
 *		The strangeness with committing and starting transactions in the
 *		init and shutdown routines is due to the fact that the vacuum cleaner
 *		is invoked via an SQL command, and so is already executing inside
 *		a transaction.	We need to leave ourselves in a predictable state
 *		on entry and exit to the vacuum cleaner.  We commit the transaction
 *		started in PostgresMain() inside vacuum_init(), and start one in
 *		vacuum_shutdown() to match the commit waiting for us back in
 *		PostgresMain().
 */
static void
vacuum_init()
{
	/* matches the StartTransaction in PostgresMain() */
	CommitTransactionCommand();
}

static void
vacuum_shutdown()
{
	/* on entry, we are not in a transaction */

	/*
	 * Flush the init file that relcache.c uses to save startup time. The
	 * next backend startup will rebuild the init file with up-to-date
	 * information from pg_class.  This lets the optimizer see the stats
	 * that we've collected for certain critical system indexes.  See
	 * relcache.c for more details.
	 *
	 * Ignore any failure to unlink the file, since it might not be there if
	 * no backend has been started since the last vacuum...
	 */
	unlink(RELCACHE_INIT_FILENAME);

	/* matches the CommitTransaction in PostgresMain() */
	StartTransactionCommand();

	/*
	 * Clean up working storage --- note we must do this after
	 * StartTransactionCommand, else we might be trying to delete
	 * the active context!
	 */
	MemoryContextDelete(vac_context);
	vac_context = NULL;
}

/*
 *	vac_vacuum() -- vacuum the database.
 *
 *		This routine builds a list of relations to vacuum, and then calls
 *		code that vacuums them one at a time.  We are careful to vacuum each
 *		relation in a separate transaction in order to avoid holding too many
 *		locks at one time.
 */
static void
vac_vacuum(NameData *VacRelP, bool analyze, List *anal_cols2)
{
	VRelList	vrl,
				cur;

	/* get list of relations */
	vrl = getrels(VacRelP);

	/* vacuum each heap relation */
	for (cur = vrl; cur != (VRelList) NULL; cur = cur->vrl_next)
	{
		vacuum_rel(cur->vrl_relid, analyze, false);
		/* analyze separately so locking is minimized */
		if (analyze)
			analyze_rel(cur->vrl_relid, anal_cols2, MESSAGE_LEVEL);
	}
}

static VRelList
getrels(NameData *VacRelP)
{
	Relation	rel;
	TupleDesc	tupdesc;
	HeapScanDesc scan;
	HeapTuple	tuple;
	VRelList	vrl,
				cur;
	Datum		d;
	char	   *rname;
	char		rkind;
	bool		n;
	bool		found = false;
	ScanKeyData key;

	StartTransactionCommand();

	if (NameStr(*VacRelP))
	{

		/*
		 * we could use the cache here, but it is clearer to use scankeys
		 * for both vacuum cases, bjm 2000/01/19
		 */
		char	   *nontemp_relname;

		/* We must re-map temp table names bjm 2000-04-06 */
		if ((nontemp_relname =
			 get_temp_rel_by_username(NameStr(*VacRelP))) == NULL)
			nontemp_relname = NameStr(*VacRelP);

		ScanKeyEntryInitialize(&key, 0x0, Anum_pg_class_relname,
							   F_NAMEEQ,
							   PointerGetDatum(nontemp_relname));
	}
	else
	{
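		/* no relation name given: collect all ordinary tables (relkind 'r') */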
		ScanKeyEntryInitialize(&key, 0x0, Anum_pg_class_relkind,
							   F_CHAREQ, CharGetDatum('r'));
	}

	vrl = cur = (VRelList) NULL;

	rel = heap_openr(RelationRelationName, AccessShareLock);
	tupdesc = RelationGetDescr(rel);

	scan = heap_beginscan(rel, false, SnapshotNow, 1, &key);

	while (HeapTupleIsValid(tuple = heap_getnext(scan, 0)))
	{
		found = true;

		d = heap_getattr(tuple, Anum_pg_class_relname, tupdesc, &n);
		rname = (char *) d;

		d = heap_getattr(tuple, Anum_pg_class_relkind, tupdesc, &n);

		rkind = DatumGetChar(d);

		if (rkind != RELKIND_RELATION)
		{
			elog(NOTICE, "Vacuum: can not process indecies, views and certain system tables");
			continue;
		}

		/* get a relation list entry for this guy */
		if (vrl == (VRelList) NULL)
			vrl = cur = (VRelList)
				MemoryContextAlloc(vac_context, sizeof(VRelListData));
		else
		{
			cur->vrl_next = (VRelList)
				MemoryContextAlloc(vac_context, sizeof(VRelListData));
			cur = cur->vrl_next;
		}

		cur->vrl_relid = tuple->t_data->t_oid;
		cur->vrl_next = (VRelList) NULL;
	}

	heap_endscan(scan);
	heap_close(rel, AccessShareLock);

	if (!found)
		elog(NOTICE, "Vacuum: table not found");

	CommitTransactionCommand();

	return vrl;
}

/*
 *	vacuum_rel() -- vacuum one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indices, and
 *		updates its num_pages and num_tuples statistics.
 *
 *		Doing one heap at a time incurs extra overhead, since we need to
 *		check that the heap exists again just before we vacuum it.	The
 *		reason that we do this is so that vacuuming can be spread across
 *		many small transactions.  Otherwise, two-phase locking would require
 *		us to lock the entire database during one pass of the vacuum cleaner.
 */
static void
vacuum_rel(Oid relid, bool analyze, bool is_toastrel)
{
	Relation	onerel;
	VacPageListData vacuum_pages; /* List of pages to vacuum and/or clean
								 * indices */
	VacPageListData fraged_pages; /* List of pages with space enough for
								 * re-using */
	VacPage    *vacpage;
	Relation   *Irel;
	int32		nindices,
				i;
	VRelStats  *vacrelstats;
	bool		reindex = false;
	Oid			toast_relid;

	if (!is_toastrel)
		StartTransactionCommand();

	/*
	 * Check for user-requested abort.	Note we want this to be inside a
	 * transaction, so xact.c doesn't issue useless NOTICE.
	 */
	if (QueryCancel)
		CancelQuery();

	/*
	 * Race condition -- if the pg_class tuple has gone away since the
	 * last time we saw it, we don't need to vacuum it.
	 */
	if (!SearchSysCacheExists(RELOID,
							  ObjectIdGetDatum(relid),
							  0, 0, 0))
	{
		if (!is_toastrel)
			CommitTransactionCommand();
		return;
	}

	/*
	 * Open the class, get an exclusive lock on it, and check permissions.
	 *
	 * Note we choose to treat permissions failure as a NOTICE and keep
	 * trying to vacuum the rest of the DB --- is this appropriate?
	 */
	onerel = heap_open(relid, AccessExclusiveLock);

	if (!pg_ownercheck(GetUserId(), RelationGetRelationName(onerel),
					   RELNAME))
	{
		elog(NOTICE, "Skipping \"%s\" --- only table owner can VACUUM it",
			 RelationGetRelationName(onerel));
		heap_close(onerel, AccessExclusiveLock);
		if (!is_toastrel)
			CommitTransactionCommand();
		return;
	}

	/*
	 * Remember the relation's TOAST relation for later
	 */
	toast_relid = onerel->rd_rel->reltoastrelid;

	/*
	 * Set up statistics-gathering machinery.
	 */
	vacrelstats = (VRelStats *) palloc(sizeof(VRelStats));
	vacrelstats->relid = relid;
	vacrelstats->num_pages = vacrelstats->num_tuples = 0;
	vacrelstats->hasindex = false;

	GetXmaxRecent(&XmaxRecent);

	/* scan it */
	reindex = false;
	vacuum_pages.num_pages = fraged_pages.num_pages = 0;
	scan_heap(vacrelstats, onerel, &vacuum_pages, &fraged_pages);
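	/*
	 * For a system relation whose indexes are being ignored, the indexes
	 * are deactivated and rebuilt from scratch (activate_indexes_of_a_table
	 * below) rather than vacuumed along with the heap.
	 */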
	if (IsIgnoringSystemIndexes() &&
		IsSystemRelationName(RelationGetRelationName(onerel)))
		reindex = true;

	/* Now open indices */
	nindices = 0;
	Irel = (Relation *) NULL;
	get_indices(onerel, &nindices, &Irel);
	if (!Irel)
		reindex = false;
	else if (!RelationGetForm(onerel)->relhasindex)
		reindex = true;
	if (nindices > 0)
		vacrelstats->hasindex = true;
	else
		vacrelstats->hasindex = false;
	if (reindex)
	{
		for (i = 0; i < nindices; i++)
			index_close(Irel[i]);
		Irel = (Relation *) NULL;
		activate_indexes_of_a_table(relid, false);
	}

	/* Clean/scan index relation(s) */
	if (Irel != (Relation *) NULL)
	{
		if (vacuum_pages.num_pages > 0)
		{
			for (i = 0; i < nindices; i++)
				vacuum_index(&vacuum_pages, Irel[i],
							 vacrelstats->num_tuples, 0);
		}
		else
		{
			/* just scan indices to update statistics */
			for (i = 0; i < nindices; i++)
				scan_index(Irel[i], vacrelstats->num_tuples);
		}
	}

	if (fraged_pages.num_pages > 0)
	{
		/* Try to shrink heap */
		repair_frag(vacrelstats, onerel, &vacuum_pages, &fraged_pages,
					nindices, Irel);
	}
	else
	{
		if (Irel != (Relation *) NULL)
			close_indices(nindices, Irel);
		if (vacuum_pages.num_pages > 0)
		{
			/* Clean pages from vacuum_pages list */
			vacuum_heap(vacrelstats, onerel, &vacuum_pages);
		}
		else
		{
			/*
			 * Flush dirty pages out to disk.  We must do this even if we
			 * didn't do anything else, because we want to ensure that all
			 * tuples have correct on-row commit status on disk (see
			 * bufmgr.c's comments for FlushRelationBuffers()).
			 */
			i = FlushRelationBuffers(onerel, vacrelstats->num_pages);
			if (i < 0)
				elog(ERROR, "VACUUM (vacuum_rel): FlushRelationBuffers returned %d",
					 i);
		}
	}
	if (reindex)
		activate_indexes_of_a_table(relid, true);

	/*
	 * ok - free vacuum_pages list of reaped pages
	 *
	 * Isn't this a waste of code?  Upcoming commit should free memory, no?
	 */
	if (vacuum_pages.num_pages > 0)
	{
		vacpage = vacuum_pages.pagedesc;
		for (i = 0; i < vacuum_pages.num_pages; i++, vacpage++)
			pfree(*vacpage);
		pfree(vacuum_pages.pagedesc);
		if (fraged_pages.num_pages > 0)
			pfree(fraged_pages.pagedesc);
	}

	/* all done with this class, but hold lock until commit */
	heap_close(onerel, NoLock);

	/* update statistics in pg_class */
	update_relstats(vacrelstats->relid, vacrelstats->num_pages,
					vacrelstats->num_tuples, vacrelstats->hasindex,
					vacrelstats);

	/*
	 * If the relation has a secondary toast one, vacuum that too
	 * while we still hold the lock on the master table. We don't
	 * need to propagate "analyze" to it, because the toaster
	 * always uses hardcoded index access and statistics are
	 * totally unimportant for toast relations
	 */
	if (toast_relid != InvalidOid)
		vacuum_rel(toast_relid, false, true);

	/* next command frees attribute stats */
	if (!is_toastrel)
		CommitTransactionCommand();
}

/*
 *	scan_heap() -- scan an open heap relation
 *
 *		This routine sets commit times, constructs vacuum_pages list of
 *		empty/uninitialized pages and pages with dead tuples and
 *		~LP_USED line pointers, constructs fraged_pages list of pages
 *		appropriate for purposes of shrinking and maintains statistics
 *		on the number of live tuples in a heap.
 */
static void
B
Bruce Momjian 已提交
552
scan_heap(VRelStats *vacrelstats, Relation onerel,
B
Bruce Momjian 已提交
553
			VacPageList vacuum_pages, VacPageList fraged_pages)
554
{
555
	BlockNumber nblocks,
556 557 558
				blkno;
	ItemId		itemid;
	Buffer		buf;
B
Bruce Momjian 已提交
559
	HeapTupleData tuple;
560 561 562 563 564 565 566 567 568
	Page		page,
				tempPage = NULL;
	OffsetNumber offnum,
				maxoff;
	bool		pgchanged,
				tupgone,
				dobufrel,
				notup;
	char	   *relname;
B
Bruce Momjian 已提交
569
	VacPage		vacpage,
570
				vp;
B
Bruce Momjian 已提交
571 572
	uint32		tups_vacuumed,
				num_tuples,
573
				nkeep,
574 575
				nunused,
				ncrash,
B
Bruce Momjian 已提交
576 577 578
				empty_pages,
				new_pages,
				changed_pages,
B
Bruce Momjian 已提交
579 580 581
				empty_end_pages;
	Size		free_size,
				usable_free_size;
582
	Size		min_tlen = MaxTupleSize;
583
	Size		max_tlen = 0;
	int32		i;
	bool		do_shrinking = true;
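	/*
	 * vtlinks records update-chain links (this tuple's TID and the TID of
	 * its newer version) for recently-updated tuples that are kept;
	 * repair_frag() searches this array to move whole chains together.
	 */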
	VTupleLink	vtlinks = (VTupleLink) palloc(100 * sizeof(VTupleLinkData));
	int			num_vtlinks = 0;
	int			free_vtlinks = 100;
589
	struct rusage ru0;
590 591 592

	getrusage(RUSAGE_SELF, &ru0);

593
	relname = RelationGetRelationName(onerel);
594 595 596
	elog(MESSAGE_LEVEL, "--Relation %s--", relname);

	tups_vacuumed = num_tuples = nkeep = nunused = ncrash = empty_pages =
B
Bruce Momjian 已提交
597 598
		new_pages = changed_pages = empty_end_pages = 0;
	free_size = usable_free_size = 0;
599 600 601

	nblocks = RelationGetNumberOfBlocks(onerel);

B
Bruce Momjian 已提交
602 603
	vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
	vacpage->offsets_used = 0;
604 605 606 607 608

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		buf = ReadBuffer(onerel, blkno);
		page = BufferGetPage(buf);
B
Bruce Momjian 已提交
609 610
		vacpage->blkno = blkno;
		vacpage->offsets_free = 0;
611

612 613 614 615 616
		if (PageIsNew(page))
		{
			elog(NOTICE, "Rel %s: Uninitialized page %u - fixing",
				 relname, blkno);
			PageInit(page, BufferGetPageSize(buf), 0);
B
Bruce Momjian 已提交
617 618
			vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
			free_size += (vacpage->free - sizeof(ItemIdData));
B
Bruce Momjian 已提交
619
			new_pages++;
B
Bruce Momjian 已提交
620
			empty_end_pages++;
B
Bruce Momjian 已提交
621
			reap_page(vacuum_pages, vacpage);
622 623
			WriteBuffer(buf);
			continue;
V
Vadim B. Mikheev 已提交
624
		}
625 626

		if (PageIsEmpty(page))
627
		{
B
Bruce Momjian 已提交
628 629
			vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
			free_size += (vacpage->free - sizeof(ItemIdData));
B
Bruce Momjian 已提交
630
			empty_pages++;
B
Bruce Momjian 已提交
631
			empty_end_pages++;
B
Bruce Momjian 已提交
632
			reap_page(vacuum_pages, vacpage);
633 634
			ReleaseBuffer(buf);
			continue;
635 636
		}

637 638 639 640 641 642
		pgchanged = false;
		notup = true;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
643
		{
644 645 646 647 648 649 650 651
			itemid = PageGetItemId(page, offnum);

			/*
			 * Collect un-used items too - it's possible to have indices
			 * pointing here after crash.
			 */
			if (!ItemIdIsUsed(itemid))
			{
B
Bruce Momjian 已提交
652
				vacpage->offsets[vacpage->offsets_free++] = offnum;
653 654 655 656
				nunused++;
				continue;
			}

657
			tuple.t_datamcxt = NULL;
658 659 660
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);
			tupgone = false;

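			/*
			 * First resolve the inserting transaction's status and set the
			 * xmin hint bits accordingly; dead tuples are flagged via tupgone.
			 */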
			if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
664
			{
665
				if (tuple.t_data->t_infomask & HEAP_XMIN_INVALID)
666
					tupgone = true;
667 668 669
				else if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
				{
					if (TransactionIdDidCommit((TransactionId)
B
Bruce Momjian 已提交
670
											   tuple.t_data->t_cmin))
671 672
					{
						tuple.t_data->t_infomask |= HEAP_XMIN_INVALID;
673
						pgchanged = true;
674 675 676 677 678 679 680 681 682 683 684 685 686 687
						tupgone = true;
					}
					else
					{
						tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
						pgchanged = true;
					}
				}
				else if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
				{
					if (!TransactionIdDidCommit((TransactionId)
												tuple.t_data->t_cmin))
					{
						tuple.t_data->t_infomask |= HEAP_XMIN_INVALID;
688
						pgchanged = true;
689 690 691 692 693 694 695 696
						tupgone = true;
					}
					else
					{
						tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
						pgchanged = true;
					}
				}
697 698
				else
				{
699
					if (TransactionIdDidAbort(tuple.t_data->t_xmin))
V
Vadim B. Mikheev 已提交
700
						tupgone = true;
701
					else if (TransactionIdDidCommit(tuple.t_data->t_xmin))
V
Vadim B. Mikheev 已提交
702
					{
703
						tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
V
Vadim B. Mikheev 已提交
704 705
						pgchanged = true;
					}
706
					else if (!TransactionIdIsInProgress(tuple.t_data->t_xmin))
V
Vadim B. Mikheev 已提交
707
					{
B
Bruce Momjian 已提交
708

V
Vadim B. Mikheev 已提交
709
						/*
710
						 * Not Aborted, Not Committed, Not in Progress -
V
Vadim B. Mikheev 已提交
711 712 713 714 715 716 717 718
						 * so it's from crashed process. - vadim 11/26/96
						 */
						ncrash++;
						tupgone = true;
					}
					else
					{
						elog(NOTICE, "Rel %s: TID %u/%u: InsertTransactionInProgress %u - can't shrink relation",
B
Bruce Momjian 已提交
719
						   relname, blkno, offnum, tuple.t_data->t_xmin);
V
Vadim B. Mikheev 已提交
720 721
						do_shrinking = false;
					}
722 723 724
				}
			}

725 726 727
			/*
			 * here we are concerned about tuples with xmin committed and
			 * xmax unknown or committed
V
Vadim B. Mikheev 已提交
728
			 */
729 730
			if (tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED &&
				!(tuple.t_data->t_infomask & HEAP_XMAX_INVALID))
731
			{
732
				if (tuple.t_data->t_infomask & HEAP_XMAX_COMMITTED)
733 734 735 736
				{
					if (tuple.t_data->t_infomask & HEAP_MARKED_FOR_UPDATE)
					{
						tuple.t_data->t_infomask |= HEAP_XMAX_INVALID;
737 738 739
						tuple.t_data->t_infomask &=
							~(HEAP_XMAX_COMMITTED | HEAP_MARKED_FOR_UPDATE);
						pgchanged = true;
740 741 742 743
					}
					else
						tupgone = true;
				}
744
				else if (TransactionIdDidAbort(tuple.t_data->t_xmax))
745
				{
746
					tuple.t_data->t_infomask |= HEAP_XMAX_INVALID;
747 748
					pgchanged = true;
				}
749
				else if (TransactionIdDidCommit(tuple.t_data->t_xmax))
750 751 752 753
				{
					if (tuple.t_data->t_infomask & HEAP_MARKED_FOR_UPDATE)
					{
						tuple.t_data->t_infomask |= HEAP_XMAX_INVALID;
754 755
						tuple.t_data->t_infomask &=
							~(HEAP_XMAX_COMMITTED | HEAP_MARKED_FOR_UPDATE);
756 757 758 759 760
						pgchanged = true;
					}
					else
						tupgone = true;
				}
761
				else if (!TransactionIdIsInProgress(tuple.t_data->t_xmax))
762
				{
B
Bruce Momjian 已提交
763

764 765 766 767
					/*
					 * Not Aborted, Not Committed, Not in Progress - so it's
					 * from a crashed process. - vadim 06/02/97
					 */
768
					tuple.t_data->t_infomask |= HEAP_XMAX_INVALID;
769 770
					tuple.t_data->t_infomask &=
						~(HEAP_XMAX_COMMITTED | HEAP_MARKED_FOR_UPDATE);
771 772 773 774 775
					pgchanged = true;
				}
				else
				{
					elog(NOTICE, "Rel %s: TID %u/%u: DeleteTransactionInProgress %u - can't shrink relation",
776
						 relname, blkno, offnum, tuple.t_data->t_xmax);
777 778
					do_shrinking = false;
				}
B
Bruce Momjian 已提交
779

780
				/*
B
Bruce Momjian 已提交
781 782
				 * If tuple is recently deleted then we must not remove it
				 * from relation.
783
				 */
784
				if (tupgone && (tuple.t_data->t_infomask & HEAP_XMIN_INVALID) == 0 && tuple.t_data->t_xmax >= XmaxRecent)
785 786 787 788 789 790 791 792
				{
					tupgone = false;
					nkeep++;
					if (!(tuple.t_data->t_infomask & HEAP_XMAX_COMMITTED))
					{
						tuple.t_data->t_infomask |= HEAP_XMAX_COMMITTED;
						pgchanged = true;
					}
B
Bruce Momjian 已提交
793

794 795 796 797 798
					/*
					 * If we do shrinking and this tuple is updated one
					 * then remember it to construct updated tuple
					 * dependencies.
					 */
B
Bruce Momjian 已提交
799 800
					if (do_shrinking && !(ItemPointerEquals(&(tuple.t_self),
											   &(tuple.t_data->t_ctid))))
801 802 803 804
					{
						if (free_vtlinks == 0)
						{
							free_vtlinks = 1000;
B
Bruce Momjian 已提交
805 806 807
							vtlinks = (VTupleLink) repalloc(vtlinks,
										   (free_vtlinks + num_vtlinks) *
												 sizeof(VTupleLinkData));
808 809 810 811 812 813 814
						}
						vtlinks[num_vtlinks].new_tid = tuple.t_data->t_ctid;
						vtlinks[num_vtlinks].this_tid = tuple.t_self;
						free_vtlinks--;
						num_vtlinks++;
					}
				}
815 816 817 818 819
			}

			/*
			 * Other checks...
			 */
820
			if (!OidIsValid(tuple.t_data->t_oid))
821 822 823 824 825 826 827
			{
				elog(NOTICE, "Rel %s: TID %u/%u: OID IS INVALID. TUPGONE %d.",
					 relname, blkno, offnum, tupgone);
			}

			if (tupgone)
			{
828
				ItemId		lpp;
829

830 831 832 833 834 835 836 837
				/*
				 * Here we are building a temporary copy of the page with
				 * dead tuples removed.  Below we will apply
				 * PageRepairFragmentation to the copy, so that we can
				 * determine how much space will be available after
				 * removal of dead tuples.  But note we are NOT changing
				 * the real page yet...
				 */
838 839
				if (tempPage == (Page) NULL)
				{
840
					Size		pageSize;
841 842 843 844 845 846

					pageSize = PageGetPageSize(page);
					tempPage = (Page) palloc(pageSize);
					memmove(tempPage, page, pageSize);
				}

847
				/* mark it unused on the temp page */
848 849 850
				lpp = &(((PageHeader) tempPage)->pd_linp[offnum - 1]);
				lpp->lp_flags &= ~LP_USED;

B
Bruce Momjian 已提交
851
				vacpage->offsets[vacpage->offsets_free++] = offnum;
B
Bruce Momjian 已提交
852
				tups_vacuumed++;
853 854 855
			}
			else
			{
B
Bruce Momjian 已提交
856
				num_tuples++;
857
				notup = false;
858 859 860 861
				if (tuple.t_len < min_tlen)
					min_tlen = tuple.t_len;
				if (tuple.t_len > max_tlen)
					max_tlen = tuple.t_len;
862
			}
863
		}
864 865 866 867 868

		if (pgchanged)
		{
			WriteBuffer(buf);
			dobufrel = false;
B
Bruce Momjian 已提交
869
			changed_pages++;
870 871
		}
		else
872
			dobufrel = true;
B
Bruce Momjian 已提交
873

B
Bruce Momjian 已提交
874
		if (tempPage != (Page) NULL)
875 876
		{						/* Some tuples are gone */
			PageRepairFragmentation(tempPage);
B
Bruce Momjian 已提交
877 878 879
			vacpage->free = ((PageHeader) tempPage)->pd_upper - ((PageHeader) tempPage)->pd_lower;
			free_size += vacpage->free;
			reap_page(vacuum_pages, vacpage);
880 881
			pfree(tempPage);
			tempPage = (Page) NULL;
882
		}
B
Bruce Momjian 已提交
883
		else if (vacpage->offsets_free > 0)
884
		{						/* there are only ~LP_USED line pointers */
B
Bruce Momjian 已提交
885 886 887
			vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
			free_size += vacpage->free;
			reap_page(vacuum_pages, vacpage);
V
Vadim B. Mikheev 已提交
888
		}
889 890 891
		if (dobufrel)
			ReleaseBuffer(buf);
		if (notup)
B
Bruce Momjian 已提交
892
			empty_end_pages++;
893
		else
B
Bruce Momjian 已提交
894
			empty_end_pages = 0;
895 896
	}

B
Bruce Momjian 已提交
897
	pfree(vacpage);
898 899

	/* save stats in the rel list for use later */
B
Bruce Momjian 已提交
900 901
	vacrelstats->num_tuples = num_tuples;
	vacrelstats->num_pages = nblocks;
902
/*	  vacrelstats->natts = attr_cnt;*/
B
Bruce Momjian 已提交
903
	if (num_tuples == 0)
904 905 906 907
		min_tlen = max_tlen = 0;
	vacrelstats->min_tlen = min_tlen;
	vacrelstats->max_tlen = max_tlen;

B
Bruce Momjian 已提交
908 909
	vacuum_pages->empty_end_pages = empty_end_pages;
	fraged_pages->empty_end_pages = empty_end_pages;
910 911

	/*
912
	 * Try to make fraged_pages keeping in mind that we can't use free
B
Bruce Momjian 已提交
913
	 * space of "empty" end-pages and the last page if it is reaped.
914
	 */
B
Bruce Momjian 已提交
915
	if (do_shrinking && vacuum_pages->num_pages - empty_end_pages > 0)
916
	{
917
		int			nusf;		/* blocks useful for re-using */
918

B
Bruce Momjian 已提交
919 920
		nusf = vacuum_pages->num_pages - empty_end_pages;
		if ((vacuum_pages->pagedesc[nusf - 1])->blkno == nblocks - empty_end_pages - 1)
921 922 923 924
			nusf--;

		for (i = 0; i < nusf; i++)
		{
B
Bruce Momjian 已提交
925
			vp = vacuum_pages->pagedesc[i];
B
Bruce Momjian 已提交
926
			if (enough_space(vp, min_tlen))
927
			{
B
Bruce Momjian 已提交
928
				vpage_insert(fraged_pages, vp);
B
Bruce Momjian 已提交
929
				usable_free_size += vp->free;
930 931
			}
		}
V
Vadim B. Mikheev 已提交
932
	}
933

934 935
	if (usable_free_size > 0 && num_vtlinks > 0)
	{
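		/*
		 * Sort the chain links so repair_frag() can find a parent tuple
		 * with a binary search (vac_find_eq/vac_cmp_vtlinks).
		 */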
		qsort((char *) vtlinks, num_vtlinks, sizeof(VTupleLinkData),
B
Bruce Momjian 已提交
937
			  vac_cmp_vtlinks);
938 939 940 941 942 943 944 945 946 947
		vacrelstats->vtlinks = vtlinks;
		vacrelstats->num_vtlinks = num_vtlinks;
	}
	else
	{
		vacrelstats->vtlinks = NULL;
		vacrelstats->num_vtlinks = 0;
		pfree(vtlinks);
	}

B
Bruce Momjian 已提交
948
	elog(MESSAGE_LEVEL, "Pages %u: Changed %u, reaped %u, Empty %u, New %u; \
949 950
Tup %u: Vac %u, Keep/VTL %u/%u, Crash %u, UnUsed %u, MinLen %lu, MaxLen %lu; \
Re-using: Free/Avail. Space %lu/%lu; EndEmpty/Avail. Pages %u/%u. %s",
B
Bruce Momjian 已提交
951
		 nblocks, changed_pages, vacuum_pages->num_pages, empty_pages,
B
Bruce Momjian 已提交
952 953
		 new_pages, num_tuples, tups_vacuumed,
		 nkeep, vacrelstats->num_vtlinks, ncrash,
954 955
		 nunused, (unsigned long)min_tlen, (unsigned long)max_tlen,
		 (unsigned long)free_size, (unsigned long)usable_free_size,
B
Bruce Momjian 已提交
956
		 empty_end_pages, fraged_pages->num_pages,
B
Bruce Momjian 已提交
957
		 show_rusage(&ru0));
958

B
Bruce Momjian 已提交
959
}


/*
 *	repair_frag() -- try to repair relation's fragmentation
 *
 *		This routine marks dead tuples as unused and tries to re-use dead space
 *		by moving tuples (and inserting indices if needed). It constructs an
 *		Nvacpagelist list of freed pages (moved tuples) and cleans the indices
 *		for them after committing (in hack-manner - without losing locks
 *		and freeing memory!) the current transaction. It truncates the relation
 *		if some end-blocks have gone away.
 */
static void
B
Bruce Momjian 已提交
973
repair_frag(VRelStats *vacrelstats, Relation onerel,
B
Bruce Momjian 已提交
974
			   VacPageList vacuum_pages, VacPageList fraged_pages,
B
Bruce Momjian 已提交
975
			   int nindices, Relation *Irel)
976
{
977 978 979
	TransactionId myXID;
	CommandId	myCID;
	Buffer		buf,
B
Bruce Momjian 已提交
980
				cur_buffer;
981 982 983 984 985 986 987
	int			nblocks,
				blkno;
	Page		page,
				ToPage = NULL;
	OffsetNumber offnum = 0,
				maxoff = 0,
				newoff,
B
Bruce Momjian 已提交
988
				max_offset;
989 990
	ItemId		itemid,
				newitemid;
B
Bruce Momjian 已提交
991 992
	HeapTupleData tuple,
				newtup;
993 994 995 996
	TupleDesc	tupdesc;
	IndexInfo **indexInfo = NULL;
	Datum		idatum[INDEX_MAX_KEYS];
	char		inulls[INDEX_MAX_KEYS];
997
	InsertIndexResult iresult;
B
Bruce Momjian 已提交
998 999
	VacPageListData Nvacpagelist;
	VacPage		cur_page = NULL,
B
Bruce Momjian 已提交
1000
				last_vacuum_page,
B
Bruce Momjian 已提交
1001 1002
				vacpage,
			   *curpage;
B
Bruce Momjian 已提交
1003
	int			cur_item = 0;
1004
	int			last_move_dest_block = -1,
B
Bruce Momjian 已提交
1005
				last_vacuum_block,
1006
				i = 0;
B
Bruce Momjian 已提交
1007 1008 1009 1010
	Size		tuple_len;
	int			num_moved,
				num_fraged_pages,
				vacuumed_pages;
B
Bruce Momjian 已提交
1011
	int			checked_moved,
1012 1013
				num_tuples,
				keep_tuples = 0;
1014
	bool		isempty,
1015 1016
				dowrite,
				chain_tuple_moved;
1017
	struct rusage ru0;
1018 1019 1020 1021 1022 1023

	getrusage(RUSAGE_SELF, &ru0);

	myXID = GetCurrentTransactionId();
	myCID = GetCurrentCommandId();

1024 1025
	tupdesc = RelationGetDescr(onerel);

1026
	if (Irel != (Relation *) NULL)		/* preparation for index' inserts */
1027
		indexInfo = get_index_desc(onerel, nindices, Irel);
1028

B
Bruce Momjian 已提交
1029 1030 1031 1032 1033 1034
	Nvacpagelist.num_pages = 0;
	num_fraged_pages = fraged_pages->num_pages;
	Assert(vacuum_pages->num_pages > vacuum_pages->empty_end_pages);
	vacuumed_pages = vacuum_pages->num_pages - vacuum_pages->empty_end_pages;
	last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
	last_vacuum_block = last_vacuum_page->blkno;
B
Bruce Momjian 已提交
1035 1036
	cur_buffer = InvalidBuffer;
	num_moved = 0;
1037

B
Bruce Momjian 已提交
1038 1039
	vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
	vacpage->offsets_used = vacpage->offsets_free = 0;
1040

1041 1042
	/*
	 * Scan pages backwards from the last nonempty page, trying to move
B
Bruce Momjian 已提交
1043 1044
	 * tuples down to lower pages.	Quit when we reach a page that we have
	 * moved any tuples onto.  Note that if a page is still in the
1045 1046 1047 1048
	 * fraged_pages list (list of candidate move-target pages) when we
	 * reach it, we will remove it from the list.  This ensures we never
	 * move a tuple up to a higher page number.
	 *
B
Bruce Momjian 已提交
1049 1050
	 * NB: this code depends on the vacuum_pages and fraged_pages lists being
	 * in order, and on fraged_pages being a subset of vacuum_pages.
1051
	 */
B
Bruce Momjian 已提交
1052
	nblocks = vacrelstats->num_pages;
B
Bruce Momjian 已提交
1053
	for (blkno = nblocks - vacuum_pages->empty_end_pages - 1;
1054 1055
		 blkno > last_move_dest_block;
		 blkno--)
V
Vadim B. Mikheev 已提交
1056
	{
1057 1058 1059
		buf = ReadBuffer(onerel, blkno);
		page = BufferGetPage(buf);

B
Bruce Momjian 已提交
1060
		vacpage->offsets_free = 0;
1061 1062 1063 1064

		isempty = PageIsEmpty(page);

		dowrite = false;
B
Bruce Momjian 已提交
1065
		if (blkno == last_vacuum_block) /* it's reaped page */
1066
		{
B
Bruce Momjian 已提交
1067
			if (last_vacuum_page->offsets_free > 0) /* there are dead tuples */
1068 1069
			{					/* on this page - clean */
				Assert(!isempty);
B
Bruce Momjian 已提交
1070
				vacuum_page(page, last_vacuum_page);
1071 1072 1073 1074
				dowrite = true;
			}
			else
				Assert(isempty);
B
Bruce Momjian 已提交
1075
			--vacuumed_pages;
1076 1077
			if (vacuumed_pages > 0)
			{
B
Bruce Momjian 已提交
1078
				/* get prev reaped page from vacuum_pages */
B
Bruce Momjian 已提交
1079 1080
				last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
				last_vacuum_block = last_vacuum_page->blkno;
1081 1082
			}
			else
1083
			{
1084 1085 1086 1087
				last_vacuum_page = NULL;
				last_vacuum_block = -1;
			}
			if (num_fraged_pages > 0 &&
B
Bruce Momjian 已提交
1088
			fraged_pages->pagedesc[num_fraged_pages - 1]->blkno ==
1089
				(BlockNumber) blkno)
1090 1091
			{
				/* page is in fraged_pages too; remove it */
B
Bruce Momjian 已提交
1092
				--num_fraged_pages;
1093 1094 1095 1096 1097 1098 1099 1100 1101 1102
			}
			if (isempty)
			{
				ReleaseBuffer(buf);
				continue;
			}
		}
		else
			Assert(!isempty);

B
Bruce Momjian 已提交
1103 1104
		chain_tuple_moved = false;		/* no one chain-tuple was moved
										 * off this page, yet */
B
Bruce Momjian 已提交
1105
		vacpage->blkno = blkno;
1106 1107 1108 1109 1110 1111 1112 1113 1114 1115
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			itemid = PageGetItemId(page, offnum);

			if (!ItemIdIsUsed(itemid))
				continue;

1116
			tuple.t_datamcxt = NULL;
1117 1118 1119
			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple_len = tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);
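			/*
			 * A tuple that still lacks the XMIN_COMMITTED hint here can only
			 * be one this VACUUM itself has moved (its t_cmin holds our XID);
			 * anything else is reported as an error below.
			 */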
			if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
			{
B
Bruce Momjian 已提交
1123
				if ((TransactionId) tuple.t_data->t_cmin != myXID)
1124 1125
					elog(ERROR, "Invalid XID in t_cmin");
				if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
1126
					elog(ERROR, "HEAP_MOVED_IN was not expected");
B
Bruce Momjian 已提交
1127 1128 1129

				/*
				 * If this (chain) tuple has already been moved by me, then I
				 * have to check whether it is in vacpage or not - i.e. whether
				 * it was moved while cleaning this page or some previous one.
1132 1133 1134 1135 1136
				 */
				if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
				{
					if (keep_tuples == 0)
						continue;
B
Bruce Momjian 已提交
1137 1138 1139
					if (chain_tuple_moved)		/* some chains was moved
												 * while */
					{			/* cleaning this page */
B
Bruce Momjian 已提交
1140 1141
						Assert(vacpage->offsets_free > 0);
						for (i = 0; i < vacpage->offsets_free; i++)
1142
						{
B
Bruce Momjian 已提交
1143
							if (vacpage->offsets[i] == offnum)
1144 1145
								break;
						}
B
Bruce Momjian 已提交
1146
						if (i >= vacpage->offsets_free) /* not found */
1147
						{
B
Bruce Momjian 已提交
1148
							vacpage->offsets[vacpage->offsets_free++] = offnum;
1149 1150 1151 1152 1153
							keep_tuples--;
						}
					}
					else
					{
B
Bruce Momjian 已提交
1154
						vacpage->offsets[vacpage->offsets_free++] = offnum;
1155 1156 1157 1158 1159
						keep_tuples--;
					}
					continue;
				}
				elog(ERROR, "HEAP_MOVED_OFF was expected");
1160 1161 1162
			}

			/*
B
Bruce Momjian 已提交
1163 1164 1165
			 * If this tuple is in a chain of tuples created in updates
			 * by "recent" transactions then we have to move the whole chain
			 * of tuples to other places.
1166
			 */
B
Bruce Momjian 已提交
1167
			if ((tuple.t_data->t_infomask & HEAP_UPDATED &&
1168
				 tuple.t_data->t_xmin >= XmaxRecent) ||
B
Bruce Momjian 已提交
1169
				(!(tuple.t_data->t_infomask & HEAP_XMAX_INVALID) &&
1170 1171
				 !(ItemPointerEquals(&(tuple.t_self), &(tuple.t_data->t_ctid)))))
			{
B
Bruce Momjian 已提交
1172 1173 1174 1175 1176 1177 1178
				Buffer		Cbuf = buf;
				Page		Cpage;
				ItemId		Citemid;
				ItemPointerData Ctid;
				HeapTupleData tp = tuple;
				Size		tlen = tuple_len;
				VTupleMove	vtmove = (VTupleMove)
B
Bruce Momjian 已提交
1179
				palloc(100 * sizeof(VTupleMoveData));
B
Bruce Momjian 已提交
1180 1181
				int			num_vtmove = 0;
				int			free_vtmove = 100;
B
Bruce Momjian 已提交
1182
				VacPage		to_vacpage = NULL;
B
Bruce Momjian 已提交
1183 1184 1185
				int			to_item = 0;
				bool		freeCbuf = false;
				int			ti;
1186 1187 1188 1189 1190 1191 1192 1193

				if (vacrelstats->vtlinks == NULL)
					elog(ERROR, "No one parent tuple was found");
				if (cur_buffer != InvalidBuffer)
				{
					WriteBuffer(cur_buffer);
					cur_buffer = InvalidBuffer;
				}
B
Bruce Momjian 已提交
1194

1195
				/*
B
Bruce Momjian 已提交
1196 1197
				 * If this tuple is in the begin/middle of the chain then
				 * we have to move to the end of chain.
1198
				 */
B
Bruce Momjian 已提交
1199 1200
				while (!(tp.t_data->t_infomask & HEAP_XMAX_INVALID) &&
				!(ItemPointerEquals(&(tp.t_self), &(tp.t_data->t_ctid))))
1201 1202 1203 1204 1205
				{
					Ctid = tp.t_data->t_ctid;
					if (freeCbuf)
						ReleaseBuffer(Cbuf);
					freeCbuf = true;
B
Bruce Momjian 已提交
1206 1207
					Cbuf = ReadBuffer(onerel,
									  ItemPointerGetBlockNumber(&Ctid));
1208
					Cpage = BufferGetPage(Cbuf);
B
Bruce Momjian 已提交
1209 1210
					Citemid = PageGetItemId(Cpage,
									  ItemPointerGetOffsetNumber(&Ctid));
1211
					if (!ItemIdIsUsed(Citemid))
1212
					{
B
Bruce Momjian 已提交
1213

1214
						/*
B
Bruce Momjian 已提交
1215 1216 1217 1218 1219
						 * This means that in the middle of the chain there
						 * was a tuple updated by an older (than XmaxRecent)
						 * xaction, and this tuple has already been deleted by
						 * me. Actually, the upper part of the chain should be
						 * removed, and it seems that this should be handled
						 * in scan_heap(), but it's not implemented at
						 * the moment and so we just stop shrinking here.
1222 1223 1224 1225
						 */
						ReleaseBuffer(Cbuf);
						pfree(vtmove);
						vtmove = NULL;
B
Bruce Momjian 已提交
1226
						elog(NOTICE, "Child itemid in update-chain marked as unused - can't continue repair_frag");
1227 1228
						break;
					}
1229
					tp.t_datamcxt = NULL;
1230 1231 1232 1233
					tp.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
					tp.t_self = Ctid;
					tlen = tp.t_len = ItemIdGetLength(Citemid);
				}
1234 1235
				if (vtmove == NULL)
					break;
1236
				/* first, can chain be moved ? */
B
Bruce Momjian 已提交
1237
				for (;;)
1238
				{
B
Bruce Momjian 已提交
1239 1240
					if (to_vacpage == NULL ||
						!enough_space(to_vacpage, tlen))
1241
					{
B
Bruce Momjian 已提交
1242 1243

						/*
B
Bruce Momjian 已提交
1244
						 * if to_vacpage no longer has enough free space to be
1245 1246
						 * useful, remove it from fraged_pages list
						 */
B
Bruce Momjian 已提交
1247 1248
						if (to_vacpage != NULL &&
						 !enough_space(to_vacpage, vacrelstats->min_tlen))
1249
						{
1250
							Assert(num_fraged_pages > to_item);
B
Bruce Momjian 已提交
1251 1252 1253
							memmove(fraged_pages->pagedesc + to_item,
								fraged_pages->pagedesc + to_item + 1,
									sizeof(VacPage) * (num_fraged_pages - to_item - 1));
1254 1255 1256 1257
							num_fraged_pages--;
						}
						for (i = 0; i < num_fraged_pages; i++)
						{
B
Bruce Momjian 已提交
1258
							if (enough_space(fraged_pages->pagedesc[i], tlen))
1259 1260
								break;
						}
B
Bruce Momjian 已提交
1261

B
Bruce Momjian 已提交
1262
						/* can't move item anywhere */
B
Bruce Momjian 已提交
1263
						if (i == num_fraged_pages)
B
Bruce Momjian 已提交
1264
						{
1265 1266
							for (i = 0; i < num_vtmove; i++)
							{
B
Bruce Momjian 已提交
1267 1268
								Assert(vtmove[i].vacpage->offsets_used > 0);
								(vtmove[i].vacpage->offsets_used)--;
1269 1270 1271 1272 1273
							}
							num_vtmove = 0;
							break;
						}
						to_item = i;
B
Bruce Momjian 已提交
1274
						to_vacpage = fraged_pages->pagedesc[to_item];
1275
					}
B
Bruce Momjian 已提交
1276 1277 1278 1279
					to_vacpage->free -= MAXALIGN(tlen);
					if (to_vacpage->offsets_used >= to_vacpage->offsets_free)
						to_vacpage->free -= MAXALIGN(sizeof(ItemIdData));
					(to_vacpage->offsets_used)++;
1280 1281 1282
					if (free_vtmove == 0)
					{
						free_vtmove = 1000;
B
Bruce Momjian 已提交
1283 1284 1285
						vtmove = (VTupleMove) repalloc(vtmove,
											 (free_vtmove + num_vtmove) *
												 sizeof(VTupleMoveData));
1286 1287
					}
					vtmove[num_vtmove].tid = tp.t_self;
B
Bruce Momjian 已提交
1288 1289
					vtmove[num_vtmove].vacpage = to_vacpage;
					if (to_vacpage->offsets_used == 1)
1290 1291 1292 1293 1294
						vtmove[num_vtmove].cleanVpd = true;
					else
						vtmove[num_vtmove].cleanVpd = false;
					free_vtmove--;
					num_vtmove++;
B
Bruce Momjian 已提交
1295

B
Bruce Momjian 已提交
1296
					/* All done ? */
B
Bruce Momjian 已提交
1297 1298
					if (!(tp.t_data->t_infomask & HEAP_UPDATED) ||
						tp.t_data->t_xmin < XmaxRecent)
1299
						break;
B
Bruce Momjian 已提交
1300

B
Bruce Momjian 已提交
1301
					/* Well, try to find tuple with old row version */
B
Bruce Momjian 已提交
1302
					for (;;)
1303
					{
B
Bruce Momjian 已提交
1304 1305 1306 1307 1308 1309
						Buffer		Pbuf;
						Page		Ppage;
						ItemId		Pitemid;
						HeapTupleData Ptp;
						VTupleLinkData vtld,
								   *vtlp;
1310 1311

						vtld.new_tid = tp.t_self;
B
Bruce Momjian 已提交
1312
						vtlp = (VTupleLink)
B
Bruce Momjian 已提交
1313
							vac_find_eq((void *) (vacrelstats->vtlinks),
B
Bruce Momjian 已提交
1314 1315 1316
									   vacrelstats->num_vtlinks,
									   sizeof(VTupleLinkData),
									   (void *) &vtld,
B
Bruce Momjian 已提交
1317
									   vac_cmp_vtlinks);
1318 1319 1320
						if (vtlp == NULL)
							elog(ERROR, "Parent tuple was not found");
						tp.t_self = vtlp->this_tid;
B
Bruce Momjian 已提交
1321
						Pbuf = ReadBuffer(onerel,
1322 1323
								ItemPointerGetBlockNumber(&(tp.t_self)));
						Ppage = BufferGetPage(Pbuf);
B
Bruce Momjian 已提交
1324 1325
						Pitemid = PageGetItemId(Ppage,
							   ItemPointerGetOffsetNumber(&(tp.t_self)));
1326 1327
						if (!ItemIdIsUsed(Pitemid))
							elog(ERROR, "Parent itemid marked as unused");
1328
						Ptp.t_datamcxt = NULL;
1329
						Ptp.t_data = (HeapTupleHeader) PageGetItem(Ppage, Pitemid);
1330
						Assert(ItemPointerEquals(&(vtld.new_tid),
B
Bruce Momjian 已提交
1331 1332
												 &(Ptp.t_data->t_ctid)));

1333
						/*
B
Bruce Momjian 已提交
1334 1335 1336 1337 1338 1339 1340 1341 1342
						 * Read above about cases when
						 * !ItemIdIsUsed(Citemid) (child item is
						 * removed)... Due to the fact that at the moment
						 * we don't remove the useless part of an update-chain,
						 * it's possible to get a too-old parent row here.
						 * As in the case which caused this problem,
						 * we stop shrinking here. I could try to find the
						 * real parent row but don't want to do it because
						 * the real solution will be implemented anyway,
						 * later, and we are too close to 6.5 release. -
						 * vadim 06/11/99
1345 1346 1347 1348 1349 1350 1351 1352 1353
						 */
						if (Ptp.t_data->t_xmax != tp.t_data->t_xmin)
						{
							if (freeCbuf)
								ReleaseBuffer(Cbuf);
							freeCbuf = false;
							ReleaseBuffer(Pbuf);
							for (i = 0; i < num_vtmove; i++)
							{
B
Bruce Momjian 已提交
1354 1355
								Assert(vtmove[i].vacpage->offsets_used > 0);
								(vtmove[i].vacpage->offsets_used)--;
1356 1357
							}
							num_vtmove = 0;
B
Bruce Momjian 已提交
1358
							elog(NOTICE, "Too old parent tuple found - can't continue repair_frag");
1359 1360
							break;
						}
B
Bruce Momjian 已提交
1361 1362 1363
#ifdef NOT_USED					/* I'm not sure that this will work
								 * properly... */

1364
						/*
B
Bruce Momjian 已提交
1365 1366 1367 1368
						 * If this tuple is updated version of row and it
						 * was created by the same transaction then no one
						 * is interested in this tuple - mark it as
						 * removed.
1369
						 */
B
Bruce Momjian 已提交
1370
						if (Ptp.t_data->t_infomask & HEAP_UPDATED &&
1371 1372
							Ptp.t_data->t_xmin == Ptp.t_data->t_xmax)
						{
B
Bruce Momjian 已提交
1373 1374 1375 1376
							TransactionIdStore(myXID,
								(TransactionId *) &(Ptp.t_data->t_cmin));
							Ptp.t_data->t_infomask &=
								~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
1377 1378 1379 1380
							Ptp.t_data->t_infomask |= HEAP_MOVED_OFF;
							WriteBuffer(Pbuf);
							continue;
						}
1381
#endif
1382
						tp.t_datamcxt = Ptp.t_datamcxt;
1383 1384 1385 1386 1387 1388 1389 1390
						tp.t_data = Ptp.t_data;
						tlen = tp.t_len = ItemIdGetLength(Pitemid);
						if (freeCbuf)
							ReleaseBuffer(Cbuf);
						Cbuf = Pbuf;
						freeCbuf = true;
						break;
					}
1391 1392
					if (num_vtmove == 0)
						break;
1393 1394 1395
				}
				if (freeCbuf)
					ReleaseBuffer(Cbuf);
B
Bruce Momjian 已提交
1396
				if (num_vtmove == 0)	/* chain can't be moved */
1397 1398 1399 1400 1401 1402 1403
				{
					pfree(vtmove);
					break;
				}
				ItemPointerSetInvalid(&Ctid);
				for (ti = 0; ti < num_vtmove; ti++)
				{
B
Bruce Momjian 已提交
1404
					VacPage	destvacpage = vtmove[ti].vacpage;
1405

V
Vadim B. Mikheev 已提交
1406
					/* Get page to move from */
1407
					tuple.t_self = vtmove[ti].tid;
B
Bruce Momjian 已提交
1408 1409
					Cbuf = ReadBuffer(onerel,
							 ItemPointerGetBlockNumber(&(tuple.t_self)));
V
Vadim B. Mikheev 已提交
1410 1411 1412 1413 1414 1415 1416 1417 1418

					/* Get page to move to */
					cur_buffer = ReadBuffer(onerel, destvacpage->blkno);

					LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
					if (cur_buffer != Cbuf)
						LockBuffer(Cbuf, BUFFER_LOCK_EXCLUSIVE);

					ToPage = BufferGetPage(cur_buffer);
1419
					Cpage = BufferGetPage(Cbuf);
V
Vadim B. Mikheev 已提交
1420 1421 1422

					/* NO ELOG(ERROR) TILL CHANGES ARE LOGGED */

B
Bruce Momjian 已提交
1423
					Citemid = PageGetItemId(Cpage,
1424
							ItemPointerGetOffsetNumber(&(tuple.t_self)));
1425
					tuple.t_datamcxt = NULL;
1426 1427
					tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
					tuple_len = tuple.t_len = ItemIdGetLength(Citemid);
1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441

					/*
					 * make a copy of the source tuple, and then mark the
					 * source tuple MOVED_OFF.
					 */
					heap_copytuple_with_tuple(&tuple, &newtup);

					RelationInvalidateHeapTuple(onerel, &tuple);

					TransactionIdStore(myXID, (TransactionId *) &(tuple.t_data->t_cmin));
					tuple.t_data->t_infomask &=
						~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
					tuple.t_data->t_infomask |= HEAP_MOVED_OFF;

1442 1443 1444
					/*
					 * If this page was not used before - clean it.
					 *
1445 1446 1447 1448 1449 1450 1451 1452 1453
					 * NOTE: a nasty bug used to lurk here.  It is possible
					 * for the source and destination pages to be the same
					 * (since this tuple-chain member can be on a page lower
					 * than the one we're currently processing in the outer
					 * loop).  If that's true, then after vacuum_page() the
					 * source tuple will have been moved, and tuple.t_data
					 * will be pointing at garbage.  Therefore we must do
					 * everything that uses tuple.t_data BEFORE this step!!
					 *
1454
					 * This path is different from the other callers of
B
Bruce Momjian 已提交
1455
					 * vacuum_page, because we have already incremented the
B
Bruce Momjian 已提交
1456
					 * vacpage's offsets_used field to account for the
1457
					 * tuple(s) we expect to move onto the page. Therefore
B
Bruce Momjian 已提交
1458
					 * vacuum_page's check for offsets_used == 0 is
1459 1460 1461
					 * wrong. But since that's a good debugging check for
					 * all other callers, we work around it here rather
					 * than remove it.
1462
					 */
V
Vadim B. Mikheev 已提交
1463
					if (!PageIsEmpty(ToPage) && vtmove[ti].cleanVpd)
1464
					{
B
Bruce Momjian 已提交
1465
						int			sv_offsets_used = destvacpage->offsets_used;
1466

B
Bruce Momjian 已提交
1467 1468 1469
						destvacpage->offsets_used = 0;
						vacuum_page(ToPage, destvacpage);
						destvacpage->offsets_used = sv_offsets_used;
1470
					}
1471 1472 1473 1474 1475

					/*
					 * Update the state of the copied tuple, and store it
					 * on the destination page.
					 */
B
Bruce Momjian 已提交
1476 1477 1478
					TransactionIdStore(myXID, (TransactionId *) &(newtup.t_data->t_cmin));
					newtup.t_data->t_infomask &=
						~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_OFF);
1479 1480
					newtup.t_data->t_infomask |= HEAP_MOVED_IN;
					newoff = PageAddItem(ToPage, (Item) newtup.t_data, tuple_len,
B
Bruce Momjian 已提交
1481
										 InvalidOffsetNumber, LP_USED);
1482 1483
					if (newoff == InvalidOffsetNumber)
					{
1484 1485
						elog(STOP, "moving chain: failed to add item with len = %lu to page %u",
							 (unsigned long)tuple_len, destvacpage->blkno);
1486 1487 1488
					}
					newitemid = PageGetItemId(ToPage, newoff);
					pfree(newtup.t_data);
1489
					newtup.t_datamcxt = NULL;
1490
					newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
B
Bruce Momjian 已提交
1491
					ItemPointerSet(&(newtup.t_self), destvacpage->blkno, newoff);
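					/*
					 * XLOG the tuple move and stamp both source and
					 * destination pages with the record's LSN.
					 */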

					{
						XLogRecPtr	recptr = 
							log_heap_move(onerel, tuple.t_self, &newtup);

						if (Cbuf != cur_buffer)
						{
							PageSetLSN(Cpage, recptr);
							PageSetSUI(Cpage, ThisStartUpID);
						}
						PageSetLSN(ToPage, recptr);
						PageSetSUI(ToPage, ThisStartUpID);
					}

B
Bruce Momjian 已提交
1506 1507
					if (((int) destvacpage->blkno) > last_move_dest_block)
						last_move_dest_block = destvacpage->blkno;
B
Bruce Momjian 已提交
1508

1509
					/*
1510 1511
					 * Set new tuple's t_ctid pointing to itself for last
					 * tuple in chain, and to next tuple in chain otherwise.
1512 1513 1514 1515 1516 1517 1518 1519
					 */
					if (!ItemPointerIsValid(&Ctid))
						newtup.t_data->t_ctid = newtup.t_self;
					else
						newtup.t_data->t_ctid = Ctid;
					Ctid = newtup.t_self;

					num_moved++;
B
Bruce Momjian 已提交
1520

1521 1522 1523 1524
					/*
					 * Remember that we moved tuple from the current page
					 * (corresponding index tuple will be cleaned).
					 */
1525
					if (Cbuf == buf)
B
Bruce Momjian 已提交
1526
						vacpage->offsets[vacpage->offsets_free++] =
B
Bruce Momjian 已提交
1527
							ItemPointerGetOffsetNumber(&(tuple.t_self));
1528 1529
					else
						keep_tuples++;
1530

V
Vadim B. Mikheev 已提交
1531 1532 1533 1534
					LockBuffer(cur_buffer, BUFFER_LOCK_UNLOCK);
					if (cur_buffer != Cbuf)
						LockBuffer(Cbuf, BUFFER_LOCK_UNLOCK);

1535 1536
					if (Irel != (Relation *) NULL)
					{
1537 1538 1539 1540 1541 1542 1543 1544 1545
						/*
						 * XXX using CurrentMemoryContext here means
						 * intra-vacuum memory leak for functional indexes.
						 * Should fix someday.
						 *
						 * XXX This code fails to handle partial indexes!
						 * Probably should change it to use ExecOpenIndices.
						 */
						for (i = 0; i < nindices; i++)
						{
							FormIndexDatum(indexInfo[i],
										   &newtup,
										   tupdesc,
										   CurrentMemoryContext,
										   idatum,
										   inulls);
							iresult = index_insert(Irel[i],
												   idatum,
												   inulls,
												   &newtup.t_self,
												   onerel);
							if (iresult)
								pfree(iresult);
						}
					}
					WriteBuffer(cur_buffer);
					WriteBuffer(Cbuf);
				}
				cur_buffer = InvalidBuffer;
				pfree(vtmove);
				chain_tuple_moved = true;
				continue;
			}

			/* try to find new page for this tuple */
			if (cur_buffer == InvalidBuffer ||
				!enough_space(cur_page, tuple_len))
			{
				if (cur_buffer != InvalidBuffer)
				{
					WriteBuffer(cur_buffer);
					cur_buffer = InvalidBuffer;

					/*
					 * If previous target page is now too full to add *any*
					 * tuple to it, remove it from fraged_pages.
					 */
					if (!enough_space(cur_page, vacrelstats->min_tlen))
					{
						Assert(num_fraged_pages > cur_item);
						memmove(fraged_pages->pagedesc + cur_item,
								fraged_pages->pagedesc + cur_item + 1,
								sizeof(VacPage) * (num_fraged_pages - cur_item - 1));
						num_fraged_pages--;
					}
				}
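				/* find a fraged page with enough room for this tuple */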
				for (i = 0; i < num_fraged_pages; i++)
				{
					if (enough_space(fraged_pages->pagedesc[i], tuple_len))
						break;
				}
				if (i == num_fraged_pages)
					break;		/* can't move item anywhere */
				cur_item = i;
				cur_page = fraged_pages->pagedesc[cur_item];
				cur_buffer = ReadBuffer(onerel, cur_page->blkno);
				LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
				ToPage = BufferGetPage(cur_buffer);
				/* if this page was not used before - clean it */
				if (!PageIsEmpty(ToPage) && cur_page->offsets_used == 0)
					vacuum_page(ToPage, cur_page);
			}
			else
				LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);

			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

			/* copy tuple */
			heap_copytuple_with_tuple(&tuple, &newtup);

			RelationInvalidateHeapTuple(onerel, &tuple);

			/*
			 * Mark new tuple as moved_in by vacuum and store vacuum XID
			 * in t_cmin !!!
			 */
			TransactionIdStore(myXID, (TransactionId *) &(newtup.t_data->t_cmin));
			newtup.t_data->t_infomask &=
				~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_OFF);
			newtup.t_data->t_infomask |= HEAP_MOVED_IN;

			/* add tuple to the page */
			newoff = PageAddItem(ToPage, (Item) newtup.t_data, tuple_len,
								 InvalidOffsetNumber, LP_USED);
			if (newoff == InvalidOffsetNumber)
			{
				elog(ERROR, "\
failed to add item with len = %lu to page %u (free space %lu, nusd %u, noff %u)",
					 (unsigned long)tuple_len, cur_page->blkno, (unsigned long)cur_page->free,
				 cur_page->offsets_used, cur_page->offsets_free);
			}
			newitemid = PageGetItemId(ToPage, newoff);
			pfree(newtup.t_data);
			newtup.t_datamcxt = NULL;
			newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
			ItemPointerSet(&(newtup.t_data->t_ctid), cur_page->blkno, newoff);
			newtup.t_self = newtup.t_data->t_ctid;

			/*
			 * Mark old tuple as moved_off by vacuum and store vacuum XID
			 * in t_cmin !!!
			 */
			TransactionIdStore(myXID, (TransactionId *) &(tuple.t_data->t_cmin));
			tuple.t_data->t_infomask &=
				~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
			tuple.t_data->t_infomask |= HEAP_MOVED_OFF;

			{
				XLogRecPtr	recptr = 
					log_heap_move(onerel, tuple.t_self, &newtup);

				PageSetLSN(page, recptr);
				PageSetSUI(page, ThisStartUpID);
				PageSetLSN(ToPage, recptr);
				PageSetSUI(ToPage, ThisStartUpID);
			}

			cur_page->offsets_used++;
			num_moved++;
			cur_page->free = ((PageHeader) ToPage)->pd_upper - ((PageHeader) ToPage)->pd_lower;
			if (((int) cur_page->blkno) > last_move_dest_block)
				last_move_dest_block = cur_page->blkno;

			vacpage->offsets[vacpage->offsets_free++] = offnum;

			LockBuffer(cur_buffer, BUFFER_LOCK_UNLOCK);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);

			/* insert index' tuples if needed */
			if (Irel != (Relation *) NULL)
			{
				/*
				 * XXX using CurrentMemoryContext here means
				 * intra-vacuum memory leak for functional indexes.
				 * Should fix someday.
				 *
				 * XXX This code fails to handle partial indexes!
				 * Probably should change it to use ExecOpenIndices.
				 */
				for (i = 0; i < nindices; i++)
				{
					FormIndexDatum(indexInfo[i],
								   &newtup,
								   tupdesc,
								   CurrentMemoryContext,
								   idatum,
								   inulls);
					iresult = index_insert(Irel[i],
										   idatum,
										   inulls,
										   &newtup.t_self,
										   onerel);
					if (iresult)
						pfree(iresult);
				}
			}

		}						/* walk along page */

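		/*
		 * If we stopped the per-page walk early, account for any remaining
		 * tuples on this page that were already moved off as part of a
		 * tuple chain, so that their index entries get cleaned as well.
		 */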
		if (offnum < maxoff && keep_tuples > 0)
		{
			OffsetNumber off;

			for (off = OffsetNumberNext(offnum);
				 off <= maxoff;
				 off = OffsetNumberNext(off))
			{
				itemid = PageGetItemId(page, off);
				if (!ItemIdIsUsed(itemid))
					continue;
				tuple.t_datamcxt = NULL;
				tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
				if (tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)
					continue;
				if ((TransactionId) tuple.t_data->t_cmin != myXID)
					elog(ERROR, "Invalid XID in t_cmin (4)");
				if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
					elog(ERROR, "HEAP_MOVED_IN was not expected (2)");
				if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
				{
					/* some chain tuples were moved while */
					if (chain_tuple_moved)
					{			/* cleaning this page */
						Assert(vacpage->offsets_free > 0);
						for (i = 0; i < vacpage->offsets_free; i++)
						{
							if (vacpage->offsets[i] == off)
								break;
						}
						if (i >= vacpage->offsets_free) /* not found */
						{
							vacpage->offsets[vacpage->offsets_free++] = off;
							Assert(keep_tuples > 0);
							keep_tuples--;
						}
					}
					else
					{
						vacpage->offsets[vacpage->offsets_free++] = off;
						Assert(keep_tuples > 0);
						keep_tuples--;
					}
				}
			}
		}

		if (vacpage->offsets_free > 0)	/* some tuples were moved */
		{
			if (chain_tuple_moved)		/* else - they are ordered */
			{
				qsort((char *) (vacpage->offsets), vacpage->offsets_free,
					  sizeof(OffsetNumber), vac_cmp_offno);
			}
			reap_page(&Nvacpagelist, vacpage);
			WriteBuffer(buf);
		}
		else if (dowrite)
			WriteBuffer(buf);
		else
			ReleaseBuffer(buf);

		if (offnum <= maxoff)
			break;				/* some item(s) left */

	}							/* walk along relation */

	blkno++;					/* new number of blocks */

	if (cur_buffer != InvalidBuffer)
	{
		Assert(num_moved > 0);
		WriteBuffer(cur_buffer);
	}

	if (num_moved > 0)
	{
		/*
		 * We have to commit our tuple movings before we truncate the
		 * relation, but we mustn't lose our locks.  So, quick hack: record
		 * the status of the current transaction as committed and continue.
		 */
		RecordTransactionCommit();
	}

	/*
	 * Clean uncleaned reaped pages from the vacuum_pages list and set
	 * xmin committed for inserted tuples
	 */
	checked_moved = 0;
	for (i = 0, curpage = vacuum_pages->pagedesc; i < vacuumed_pages; i++, curpage++)
	{
		Assert((*curpage)->blkno < (BlockNumber) blkno);
		buf = ReadBuffer(onerel, (*curpage)->blkno);
		page = BufferGetPage(buf);
		if ((*curpage)->offsets_used == 0)		/* this page was not used */
		{
			if (!PageIsEmpty(page))
				vacuum_page(page, *curpage);
		}
		else							/* this page was used */
		{
			num_tuples = 0;
			max_offset = PageGetMaxOffsetNumber(page);
			for (newoff = FirstOffsetNumber;
				 newoff <= max_offset;
				 newoff = OffsetNumberNext(newoff))
			{
				itemid = PageGetItemId(page, newoff);
				if (!ItemIdIsUsed(itemid))
					continue;
				tuple.t_datamcxt = NULL;
				tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
				if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
				{
					if ((TransactionId) tuple.t_data->t_cmin != myXID)
						elog(ERROR, "Invalid XID in t_cmin (2)");
					if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
					{
						tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
						num_tuples++;
					}
					else if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
						tuple.t_data->t_infomask |= HEAP_XMIN_INVALID;
					else
						elog(ERROR, "HEAP_MOVED_OFF/HEAP_MOVED_IN was expected");
				}
			}
			Assert((*curpage)->offsets_used == num_tuples);
			checked_moved += num_tuples;
		}
		WriteBuffer(buf);
	}
	Assert(num_moved == checked_moved);

	elog(MESSAGE_LEVEL, "Rel %s: Pages: %u --> %u; Tuple(s) moved: %u. %s",
		 RelationGetRelationName(onerel),
		 nblocks, blkno, num_moved,
		 show_rusage(&ru0));

	if (Nvacpagelist.num_pages > 0)
	{
		/* vacuum indices again if needed */
		if (Irel != (Relation *) NULL)
		{
			VacPage	   *vpleft,
					   *vpright,
						vpsave;

			/* re-sort Nvacpagelist.pagedesc */
			for (vpleft = Nvacpagelist.pagedesc,
				 vpright = Nvacpagelist.pagedesc + Nvacpagelist.num_pages - 1;
				 vpleft < vpright; vpleft++, vpright--)
			{
				vpsave = *vpleft;
				*vpleft = *vpright;
				*vpright = vpsave;
			}
			Assert(keep_tuples >= 0);
			for (i = 0; i < nindices; i++)
				vacuum_index(&Nvacpagelist, Irel[i],
							 vacrelstats->num_tuples, keep_tuples);
		}

		/* clean moved tuples from last page in Nvacpagelist list */
		if (vacpage->blkno == (BlockNumber) (blkno - 1) &&
			vacpage->offsets_free > 0)
		{
			buf = ReadBuffer(onerel, vacpage->blkno);
			page = BufferGetPage(buf);
			num_tuples = 0;
			for (offnum = FirstOffsetNumber;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				itemid = PageGetItemId(page, offnum);
				if (!ItemIdIsUsed(itemid))
					continue;
				tuple.t_datamcxt = NULL;
				tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);

				if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
				{
					if ((TransactionId) tuple.t_data->t_cmin != myXID)
						elog(ERROR, "Invalid XID in t_cmin (3)");
					if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
					{
						itemid->lp_flags &= ~LP_USED;
						num_tuples++;
					}
					else
						elog(ERROR, "HEAP_MOVED_OFF was expected (2)");
				}

			}
			Assert(vacpage->offsets_free == num_tuples);
			PageRepairFragmentation(page);
			WriteBuffer(buf);
		}

		/* now - free new list of reaped pages */
		curpage = Nvacpagelist.pagedesc;
		for (i = 0; i < Nvacpagelist.num_pages; i++, curpage++)
			pfree(*curpage);
		pfree(Nvacpagelist.pagedesc);
	}

	/*
	 * Flush dirty pages out to disk.  We do this unconditionally, even if
	 * we don't need to truncate, because we want to ensure that all tuples
	 * have correct on-row commit status on disk (see bufmgr.c's comments
	 * for FlushRelationBuffers()).
	 */
	i = FlushRelationBuffers(onerel, blkno);
	if (i < 0)
		elog(ERROR, "VACUUM (repair_frag): FlushRelationBuffers returned %d",
			 i);

	/* truncate relation, if needed */
	if (blkno < nblocks)
	{
		blkno = smgrtruncate(DEFAULT_SMGR, onerel, blkno);
		Assert(blkno >= 0);
		vacrelstats->num_pages = blkno; /* set new number of blocks */
	}

	if (Irel != (Relation *) NULL)		/* pfree index' allocations */
	{
		close_indices(nindices, Irel);
		pfree(indexInfo);
	}

	pfree(vacpage);
	if (vacrelstats->vtlinks != NULL)
		pfree(vacrelstats->vtlinks);
}

/*
 *	vacuum_heap() -- free dead tuples
 *
 *		This routine marks dead tuples as unused and truncates the relation
 *		if there are "empty" end-blocks.
 */
static void
vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
{
	Buffer		buf;
	Page		page;
	VacPage    *vacpage;
	int			nblocks;
	int			i;

	nblocks = vacuum_pages->num_pages;
	nblocks -= vacuum_pages->empty_end_pages;		/* nothing to do with
													 * them */

	for (i = 0, vacpage = vacuum_pages->pagedesc; i < nblocks; i++, vacpage++)
	{
		if ((*vacpage)->offsets_free > 0)
		{
			buf = ReadBuffer(onerel, (*vacpage)->blkno);
			page = BufferGetPage(buf);
			vacuum_page(page, *vacpage);
			WriteBuffer(buf);
		}
	}

	/*
	 * Flush dirty pages out to disk.  We do this unconditionally, even if
	 * we don't need to truncate, because we want to ensure that all tuples
	 * have correct on-row commit status on disk (see bufmgr.c's comments
	 * for FlushRelationBuffers()).
	 */
	Assert(vacrelstats->num_pages >= vacuum_pages->empty_end_pages);
	nblocks = vacrelstats->num_pages - vacuum_pages->empty_end_pages;

	i = FlushRelationBuffers(onerel, nblocks);
	if (i < 0)
		elog(ERROR, "VACUUM (vacuum_heap): FlushRelationBuffers returned %d",
			 i);

	/* truncate relation if there are some empty end-pages */
	if (vacuum_pages->empty_end_pages > 0)
	{
		elog(MESSAGE_LEVEL, "Rel %s: Pages: %u --> %u.",
			 RelationGetRelationName(onerel),
			 vacrelstats->num_pages, nblocks);
		nblocks = smgrtruncate(DEFAULT_SMGR, onerel, nblocks);
		Assert(nblocks >= 0);
		vacrelstats->num_pages = nblocks; /* set new number of blocks */
	}
}

/*
 *	vacuum_page() -- free dead tuples on a page
 *					 and repair its fragmentation.
 */
static void
vacuum_page(Page page, VacPage vacpage)
{
	ItemId		itemid;
	int			i;

	/* There shouldn't be any tuples moved onto the page yet! */
	Assert(vacpage->offsets_used == 0);

	for (i = 0; i < vacpage->offsets_free; i++)
	{
		itemid = &(((PageHeader) page)->pd_linp[vacpage->offsets[i] - 1]);
		itemid->lp_flags &= ~LP_USED;
	}
	PageRepairFragmentation(page);

}

/*
 *	scan_index() -- scan one index relation to update statistics.
 *
 */
static void
scan_index(Relation indrel, int num_tuples)
{
	RetrieveIndexResult res;
	IndexScanDesc iscan;
	int			nitups;
	int			nipages;
	struct rusage ru0;

	getrusage(RUSAGE_SELF, &ru0);

	/* walk through the entire index */
	iscan = index_beginscan(indrel, false, 0, (ScanKey) NULL);
	nitups = 0;

	while ((res = index_getnext(iscan, ForwardScanDirection))
		   != (RetrieveIndexResult) NULL)
	{
		nitups++;
		pfree(res);
	}

	index_endscan(iscan);

	/* now update statistics in pg_class */
	nipages = RelationGetNumberOfBlocks(indrel);
	update_relstats(RelationGetRelid(indrel), nipages, nitups, false, NULL);

	elog(MESSAGE_LEVEL, "Index %s: Pages %u; Tuples %u. %s",
		 RelationGetRelationName(indrel), nipages, nitups,
		 show_rusage(&ru0));

	if (nitups != num_tuples)
		elog(NOTICE, "Index %s: NUMBER OF INDEX' TUPLES (%u) IS NOT THE SAME AS HEAP' (%u).\
\n\tRecreate the index.",
			 RelationGetRelationName(indrel), nitups, num_tuples);

}

/*
 *	vacuum_index() -- vacuum one index relation.
 *
 *		vacpagelist is the VacPageList of the heap we're currently vacuuming.
 *		It's locked. Indrel is an index relation on the vacuumed heap.
 *		We don't set locks on the index relation here, since the indexed
 *		access methods support locking at different granularities.
 *		We let them handle it.
 *
 *		Finally, we arrange to update the index relation's statistics in
 *		pg_class.
 */
static void
vacuum_index(VacPageList vacpagelist, Relation indrel, int num_tuples, int keep_tuples)
{
	RetrieveIndexResult res;
	IndexScanDesc iscan;
	ItemPointer heapptr;
	int			tups_vacuumed;
	int			num_index_tuples;
	int			num_pages;
	VacPage		vp;
	struct rusage ru0;

	getrusage(RUSAGE_SELF, &ru0);

	/* walk through the entire index */
	iscan = index_beginscan(indrel, false, 0, (ScanKey) NULL);
	tups_vacuumed = 0;
	num_index_tuples = 0;

	while ((res = index_getnext(iscan, ForwardScanDirection))
		   != (RetrieveIndexResult) NULL)
	{
		heapptr = &res->heap_iptr;

		if ((vp = tid_reaped(heapptr, vacpagelist)) != (VacPage) NULL)
		{
#ifdef NOT_USED
			elog(DEBUG, "<%x,%x> -> <%x,%x>",
				 ItemPointerGetBlockNumber(&(res->index_iptr)),
				 ItemPointerGetOffsetNumber(&(res->index_iptr)),
				 ItemPointerGetBlockNumber(&(res->heap_iptr)),
				 ItemPointerGetOffsetNumber(&(res->heap_iptr)));
#endif
			if (vp->offsets_free == 0)
			{
				elog(NOTICE, "Index %s: pointer to EmptyPage (blk %u off %u) - fixing",
					 RelationGetRelationName(indrel),
					 vp->blkno, ItemPointerGetOffsetNumber(heapptr));
			}
			++tups_vacuumed;
			index_delete(indrel, &res->index_iptr);
		}
		else
			num_index_tuples++;

		pfree(res);
	}

	index_endscan(iscan);

	/* now update statistics in pg_class */
	num_pages = RelationGetNumberOfBlocks(indrel);
	update_relstats(RelationGetRelid(indrel), num_pages, num_index_tuples, false, NULL);

	elog(MESSAGE_LEVEL, "Index %s: Pages %u; Tuples %u: Deleted %u. %s",
		 RelationGetRelationName(indrel), num_pages,
		 num_index_tuples - keep_tuples, tups_vacuumed,
		 show_rusage(&ru0));

	if (num_index_tuples != num_tuples + keep_tuples)
		elog(NOTICE, "Index %s: NUMBER OF INDEX' TUPLES (%u) IS NOT THE SAME AS HEAP' (%u).\
\n\tRecreate the index.",
		  RelationGetRelationName(indrel), num_index_tuples, num_tuples);

}

/*
 *	tid_reaped() -- is a particular tid reaped?
 *
 *		vacpagelist->VacPage_array is sorted in right order.
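 *		We binary-search the sorted page array, and then the sorted
 *		offsets of the matching page.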
 */
static VacPage
tid_reaped(ItemPointer itemptr, VacPageList vacpagelist)
{
	OffsetNumber ioffno;
	OffsetNumber *voff;
	VacPage		vp,
			   *vpp;
	VacPageData vacpage;

	vacpage.blkno = ItemPointerGetBlockNumber(itemptr);
	ioffno = ItemPointerGetOffsetNumber(itemptr);

	vp = &vacpage;
	vpp = (VacPage *) vac_find_eq((void *) (vacpagelist->pagedesc),
					vacpagelist->num_pages, sizeof(VacPage), (void *) &vp,
									vac_cmp_blk);

	if (vpp == (VacPage *) NULL)
		return (VacPage) NULL;
	vp = *vpp;

	/* ok - we are on true page */

	if (vp->offsets_free == 0)
	{							/* this is EmptyPage !!! */
		return vp;
	}

	voff = (OffsetNumber *) vac_find_eq((void *) (vp->offsets),
			vp->offsets_free, sizeof(OffsetNumber), (void *) &ioffno,
									   vac_cmp_offno);

	if (voff == (OffsetNumber *) NULL)
		return (VacPage) NULL;

	return vp;

}

/*
 *	update_relstats() -- update statistics for one relation
 *
 *		Statistics are stored in several places: the pg_class row for the
 *		relation has stats about the whole relation, the pg_attribute rows
 *		for each attribute store "dispersion", and there is a pg_statistic
 *		row for each (non-system) attribute.  (Dispersion probably ought to
 *		be moved to pg_statistic, but it's not worth doing unless there's
 *		another reason to have to change pg_attribute.)  Dispersion and
 *		pg_statistic values are only updated by VACUUM ANALYZE, but we
 *		always update the stats in pg_class.
 *
 *		This routine works for both index and heap relation entries in
 *		pg_class.  We violate no-overwrite semantics here by storing new
 *		values for the statistics columns directly into the pg_class
 *		tuple that's already on the page.  The reason for this is that if
 *		we updated these tuples in the usual way, vacuuming pg_class itself
 *		wouldn't work very well --- by the time we got done with a vacuum
 *		cycle, most of the tuples in pg_class would've been obsoleted.
 *		Updating pg_class's own statistics would be especially tricky.
 *		Of course, this only works for fixed-size never-null columns, but
 *		these are.
 */
static void
update_relstats(Oid relid, int num_pages, int num_tuples, bool hasindex,
			VRelStats *vacrelstats)
{
	Relation	rd;
	HeapTupleData rtup;
	HeapTuple	ctup;
	Form_pg_class pgcform;
	Buffer		buffer;

	/*
	 * update number of tuples and number of pages in pg_class
	 */
	rd = heap_openr(RelationRelationName, RowExclusiveLock);

	ctup = SearchSysCache(RELOID,
						  ObjectIdGetDatum(relid),
						  0, 0, 0);
	if (!HeapTupleIsValid(ctup))
		elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
			 relid);

	/* get the buffer cache tuple */
	rtup.t_self = ctup->t_self;
	ReleaseSysCache(ctup);
	heap_fetch(rd, SnapshotNow, &rtup, &buffer);

	/* overwrite the existing statistics in the tuple */
	pgcform = (Form_pg_class) GETSTRUCT(&rtup);
	pgcform->reltuples = num_tuples;
	pgcform->relpages = num_pages;
	pgcform->relhasindex = hasindex;

	/* invalidate the tuple in the cache and write the buffer */
	RelationInvalidateHeapTuple(rd, &rtup);
	WriteBuffer(buffer);

	heap_close(rd, RowExclusiveLock);
}

/*
 *	reap_page() -- save a page on the array of reaped pages.
 *
 *		As a side effect of the way that the vacuuming loop for a given
 *		relation works, higher pages come after lower pages in the array
 *		(and highest tid on a page is last).
 */
static void
reap_page(VacPageList vacpagelist, VacPage vacpage)
{
	VacPage	newvacpage;

	/* allocate a VacPageData entry */
	newvacpage = (VacPage) palloc(sizeof(VacPageData) + vacpage->offsets_free * sizeof(OffsetNumber));

	/* fill it in */
	if (vacpage->offsets_free > 0)
		memmove(newvacpage->offsets, vacpage->offsets, vacpage->offsets_free * sizeof(OffsetNumber));
	newvacpage->blkno = vacpage->blkno;
	newvacpage->free = vacpage->free;
	newvacpage->offsets_used = vacpage->offsets_used;
	newvacpage->offsets_free = vacpage->offsets_free;

	/* insert this page into vacpagelist list */
	vpage_insert(vacpagelist, newvacpage);

}

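/*
 *	vpage_insert() -- append a page pointer to a VacPageList, growing the
 *					  pagedesc array (by doubling) whenever it fills up.
 */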
static void
vpage_insert(VacPageList vacpagelist, VacPage vpnew)
{
#define PG_NPAGEDESC 1024

	/* allocate a VacPage entry if needed */
	if (vacpagelist->num_pages == 0)
	{
		vacpagelist->pagedesc = (VacPage *) palloc(PG_NPAGEDESC * sizeof(VacPage));
		vacpagelist->num_allocated_pages = PG_NPAGEDESC;
	}
	else if (vacpagelist->num_pages >= vacpagelist->num_allocated_pages)
	{
		vacpagelist->num_allocated_pages *= 2;
		vacpagelist->pagedesc = (VacPage *) repalloc(vacpagelist->pagedesc, vacpagelist->num_allocated_pages * sizeof(VacPage));
	}
	vacpagelist->pagedesc[vacpagelist->num_pages] = vpnew;
	(vacpagelist->num_pages)++;

}

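/*
 *	vac_find_eq() -- binary-search a sorted array of nelem elements of the
 *					 given size for an element equal to *elm (per compar),
 *					 returning a pointer to it, or NULL if not found.
 */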
static void *
vac_find_eq(void *bot, int nelem, int size, void *elm,
		   int (*compar) (const void *, const void *))
{
	int			res;
	int			last = nelem - 1;
	int			celm = nelem / 2;
	bool		last_move,
				first_move;

	last_move = first_move = true;
	for (;;)
	{
		if (first_move == true)
		{
			res = compar(bot, elm);
			if (res > 0)
				return NULL;
			if (res == 0)
				return bot;
			first_move = false;
		}
		if (last_move == true)
		{
			res = compar(elm, (void *) ((char *) bot + last * size));
			if (res > 0)
				return NULL;
			if (res == 0)
				return (void *) ((char *) bot + last * size);
			last_move = false;
		}
		res = compar(elm, (void *) ((char *) bot + celm * size));
		if (res == 0)
			return (void *) ((char *) bot + celm * size);
		if (res < 0)
		{
			if (celm == 0)
				return NULL;
			last = celm - 1;
			celm = celm / 2;
			last_move = true;
			continue;
		}

		if (celm == last)
			return NULL;

		last = last - celm - 1;
		bot = (void *) ((char *) bot + (celm + 1) * size);
		celm = (last + 1) / 2;
		first_move = true;
	}

}

static int
vac_cmp_blk(const void *left, const void *right)
{
	BlockNumber lblk,
				rblk;

	lblk = (*((VacPage *) left))->blkno;
	rblk = (*((VacPage *) right))->blkno;

	if (lblk < rblk)
		return -1;
	if (lblk == rblk)
		return 0;
	return 1;

}

static int
vac_cmp_offno(const void *left, const void *right)
{

	if (*(OffsetNumber *) left < *(OffsetNumber *) right)
		return -1;
	if (*(OffsetNumber *) left == *(OffsetNumber *) right)
		return 0;
	return 1;

}

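/*
 *	vac_cmp_vtlinks() -- compare two VTupleLinks by their new_tid: block
 *						 number (high then low half), then offset.
 */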
static int
vac_cmp_vtlinks(const void *left, const void *right)
{

	if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi <
		((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
		return -1;
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi >
		((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
		return 1;
	/* bi_hi-es are equal */
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo <
		((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
		return -1;
	if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo >
		((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
		return 1;
	/* bi_lo-es are equal */
	if (((VTupleLink) left)->new_tid.ip_posid <
		((VTupleLink) right)->new_tid.ip_posid)
		return -1;
	if (((VTupleLink) left)->new_tid.ip_posid >
		((VTupleLink) right)->new_tid.ip_posid)
		return 1;
	return 0;

}


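/*
 *	get_indices() -- open all indexes of the given heap relation and
 *					 return them in a palloc'd array.
 */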
static void
get_indices(Relation relation, int *nindices, Relation **Irel)
{
	List	   *indexoidlist,
			   *indexoidscan;
	int			i;

	indexoidlist = RelationGetIndexList(relation);

	*nindices = length(indexoidlist);

	if (*nindices > 0)
		*Irel = (Relation *) palloc(*nindices * sizeof(Relation));
	else
		*Irel = NULL;

	i = 0;
	foreach(indexoidscan, indexoidlist)
	{
		Oid			indexoid = lfirsti(indexoidscan);

		(*Irel)[i] = index_open(indexoid);
		i++;
	}

	freeList(indexoidlist);
}


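/*
 *	close_indices() -- close the index relations opened by get_indices()
 *					   and free the array.
 */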
static void
close_indices(int nindices, Relation *Irel)
{

	if (Irel == (Relation *) NULL)
		return;

	while (nindices--)
		index_close(Irel[nindices]);
	pfree(Irel);

}


/*
 * Obtain IndexInfo data for each index on the rel
 */
static IndexInfo **
get_index_desc(Relation onerel, int nindices, Relation *Irel)
{
	IndexInfo **indexInfo;
	int			i;
	HeapTuple	cachetuple;

	indexInfo = (IndexInfo **) palloc(nindices * sizeof(IndexInfo *));

	for (i = 0; i < nindices; i++)
	{
		cachetuple = SearchSysCache(INDEXRELID,
							 ObjectIdGetDatum(RelationGetRelid(Irel[i])),
									0, 0, 0);
		if (!HeapTupleIsValid(cachetuple))
			elog(ERROR, "get_index_desc: index %u not found",
				 RelationGetRelid(Irel[i]));
		indexInfo[i] = BuildIndexInfo(cachetuple);
		ReleaseSysCache(cachetuple);
	}

	return indexInfo;
}


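/*
 *	enough_space() -- does a page have enough free space for a tuple of
 *					  the given length, counting a new line pointer when
 *					  no unused one is available?
 */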
static bool
enough_space(VacPage vacpage, Size len)
{

	len = MAXALIGN(len);

	if (len > vacpage->free)
		return false;

	if (vacpage->offsets_used < vacpage->offsets_free)	/* there are free
														 * itemid(s) */
		return true;			/* and len <= free_space */

	/* ok. noff_usd >= noff_free and so we'll have to allocate new itemid */
	if (len + MAXALIGN(sizeof(ItemIdData)) <= vacpage->free)
		return true;

	return false;

}


/*
 * Compute elapsed time since ru0 usage snapshot, and format into
 * a displayable string.  Result is in a static string, which is
 * tacky, but no one ever claimed that the Postgres backend is
 * threadable...
 */
static char *
show_rusage(struct rusage * ru0)
{
	static char result[64];
	struct rusage ru1;

	getrusage(RUSAGE_SELF, &ru1);

	if (ru1.ru_stime.tv_usec < ru0->ru_stime.tv_usec)
	{
		ru1.ru_stime.tv_sec--;
		ru1.ru_stime.tv_usec += 1000000;
	}
	if (ru1.ru_utime.tv_usec < ru0->ru_utime.tv_usec)
	{
		ru1.ru_utime.tv_sec--;
		ru1.ru_utime.tv_usec += 1000000;
	}

	snprintf(result, sizeof(result),
			 "CPU %d.%02ds/%d.%02du sec.",
			 (int) (ru1.ru_stime.tv_sec - ru0->ru_stime.tv_sec),
			 (int) (ru1.ru_stime.tv_usec - ru0->ru_stime.tv_usec) / 10000,
			 (int) (ru1.ru_utime.tv_sec - ru0->ru_utime.tv_sec),
		   (int) (ru1.ru_utime.tv_usec - ru0->ru_utime.tv_usec) / 10000);

	return result;
}