analyze.c 51.6 KB
Newer Older
B
Bruce Momjian 已提交
1 2 3
/*-------------------------------------------------------------------------
 *
 * analyze.c
4
 *	  the postgres statistics generator
B
Bruce Momjian 已提交
5
 *
B
Bruce Momjian 已提交
6
 * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
B
Bruce Momjian 已提交
7 8 9 10
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
11
 *	  $Header: /cvsroot/pgsql/src/backend/commands/analyze.c,v 1.63 2003/09/25 06:57:58 petere Exp $
B
Bruce Momjian 已提交
12 13 14
 *
 *-------------------------------------------------------------------------
 */
15 16
#include "postgres.h"

17
#include <math.h>
B
Bruce Momjian 已提交
18 19

#include "access/heapam.h"
20
#include "access/tuptoaster.h"
21
#include "catalog/catalog.h"
B
Bruce Momjian 已提交
22 23
#include "catalog/catname.h"
#include "catalog/indexing.h"
24
#include "catalog/namespace.h"
B
Bruce Momjian 已提交
25 26 27 28 29 30
#include "catalog/pg_operator.h"
#include "catalog/pg_statistic.h"
#include "catalog/pg_type.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "parser/parse_oper.h"
31
#include "parser/parse_relation.h"
B
Bruce Momjian 已提交
32 33
#include "utils/acl.h"
#include "utils/builtins.h"
34
#include "utils/datum.h"
B
Bruce Momjian 已提交
35
#include "utils/fmgroids.h"
36
#include "utils/lsyscache.h"
B
Bruce Momjian 已提交
37
#include "utils/syscache.h"
38
#include "utils/tuplesort.h"
B
Bruce Momjian 已提交
39 40


41 42 43
/*
 * Analysis algorithms supported
 */
44 45
typedef enum
{
46
	ALG_MINIMAL = 1,			/* Compute only most-common-values */
47 48
	ALG_SCALAR					/* Compute MCV, histogram, sort
								 * correlation */
49 50 51 52 53 54 55 56 57 58 59
} AlgCode;

/*
 * To avoid consuming too much memory during analysis and/or too much space
 * in the resulting pg_statistic rows, we ignore varlena datums that are wider
 * than WIDTH_THRESHOLD (after detoasting!).  This is legitimate for MCV
 * and distinct-value calculations since a wide value is unlikely to be
 * duplicated at all, much less be a most-common value.  For the same reason,
 * ignoring wide values will not affect our estimates of histogram bin
 * boundaries very much.
 */
60
#define WIDTH_THRESHOLD  1024
61 62 63

/*
 * We build one of these structs for each attribute (column) that is to be
64
 * analyzed.  The struct and subsidiary data are in anl_context,
65 66 67 68 69 70 71
 * so they live until the end of the ANALYZE operation.
 */
typedef struct
{
	/* These fields are set up by examine_attribute */
	int			attnum;			/* attribute number */
	AlgCode		algcode;		/* Which algorithm to use for this column */
72
	int			minrows;		/* Minimum # of rows wanted for stats */
73 74 75 76 77 78
	Form_pg_attribute attr;		/* copy of pg_attribute row for column */
	Form_pg_type attrtype;		/* copy of pg_type row for column */
	Oid			eqopr;			/* '=' operator for datatype, if any */
	Oid			eqfunc;			/* and associated function */
	Oid			ltopr;			/* '<' operator for datatype, if any */

79 80 81 82
	/*
	 * These fields are filled in by the actual statistics-gathering
	 * routine
	 */
83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
	bool		stats_valid;
	float4		stanullfrac;	/* fraction of entries that are NULL */
	int4		stawidth;		/* average width */
	float4		stadistinct;	/* # distinct values */
	int2		stakind[STATISTIC_NUM_SLOTS];
	Oid			staop[STATISTIC_NUM_SLOTS];
	int			numnumbers[STATISTIC_NUM_SLOTS];
	float4	   *stanumbers[STATISTIC_NUM_SLOTS];
	int			numvalues[STATISTIC_NUM_SLOTS];
	Datum	   *stavalues[STATISTIC_NUM_SLOTS];
} VacAttrStats;


typedef struct
{
	Datum		value;			/* a data value */
	int			tupno;			/* position index for tuple it came from */
} ScalarItem;

typedef struct
{
	int			count;			/* # of duplicates */
	int			first;			/* values[] index of first occurrence */
} ScalarMCVItem;


109 110
#define swapInt(a,b)	do {int _tmp; _tmp=a; a=b; b=_tmp;} while(0)
#define swapDatum(a,b)	do {Datum _tmp; _tmp=a; a=b; b=_tmp;} while(0)
B
Bruce Momjian 已提交
111

112 113

/* Default statistics target (GUC parameter) */
B
Bruce Momjian 已提交
114
int			default_statistics_target = 10;
115 116


B
Bruce Momjian 已提交
117
static int	elevel = -1;
118

119 120
static MemoryContext anl_context = NULL;

121 122 123 124 125 126 127 128
/* context information for compare_scalars() */
static FmgrInfo *datumCmpFn;
static SortFunctionKind datumCmpFnKind;
static int *datumCmpTupnoLink;


static VacAttrStats *examine_attribute(Relation onerel, int attnum);
static int acquire_sample_rows(Relation onerel, HeapTuple *rows,
129
					int targrows, double *totalrows);
130 131
static double random_fract(void);
static double init_selection_state(int n);
132
static double select_next_random_record(double t, int n, double *stateptr);
133 134 135
static int	compare_rows(const void *a, const void *b);
static int	compare_scalars(const void *a, const void *b);
static int	compare_mcvs(const void *a, const void *b);
136
static void compute_minimal_stats(VacAttrStats *stats,
137 138
					  TupleDesc tupDesc, double totalrows,
					  HeapTuple *rows, int numrows);
139
static void compute_scalar_stats(VacAttrStats *stats,
140 141
					 TupleDesc tupDesc, double totalrows,
					 HeapTuple *rows, int numrows);
142
static void update_attstats(Oid relid, int natts, VacAttrStats **vacattrstats);
B
Bruce Momjian 已提交
143 144 145


/*
146
 *	analyze_rel() -- analyze one relation
B
Bruce Momjian 已提交
147 148
 */
void
149
analyze_rel(Oid relid, VacuumStmt *vacstmt)
B
Bruce Momjian 已提交
150 151
{
	Relation	onerel;
152 153 154 155 156 157
	int			attr_cnt,
				tcnt,
				i;
	VacAttrStats **vacattrstats;
	int			targrows,
				numrows;
158
	double		totalrows;
159 160 161
	HeapTuple  *rows;

	if (vacstmt->verbose)
162
		elevel = INFO;
163
	else
164
		elevel = DEBUG2;
B
Bruce Momjian 已提交
165

166
	/*
B
Bruce Momjian 已提交
167 168 169
	 * Use the current context for storing analysis info.  vacuum.c
	 * ensures that this context will be cleared when I return, thus
	 * releasing the memory allocated here.
170 171 172
	 */
	anl_context = CurrentMemoryContext;

B
Bruce Momjian 已提交
173 174
	/*
	 * Check for user-requested abort.	Note we want this to be inside a
B
Bruce Momjian 已提交
175
	 * transaction, so xact.c doesn't issue useless WARNING.
B
Bruce Momjian 已提交
176
	 */
177
	CHECK_FOR_INTERRUPTS();
B
Bruce Momjian 已提交
178 179 180

	/*
	 * Race condition -- if the pg_class tuple has gone away since the
181
	 * last time we saw it, we don't need to process it.
B
Bruce Momjian 已提交
182
	 */
183 184 185
	if (!SearchSysCacheExists(RELOID,
							  ObjectIdGetDatum(relid),
							  0, 0, 0))
186
		return;
B
Bruce Momjian 已提交
187

B
Bruce Momjian 已提交
188
	/*
189 190
	 * Open the class, getting only a read lock on it, and check
	 * permissions. Permissions check should match vacuum's check!
B
Bruce Momjian 已提交
191
	 */
192 193 194
	onerel = relation_open(relid, AccessShareLock);

	if (!(pg_class_ownercheck(RelationGetRelid(onerel), GetUserId()) ||
195
		  (pg_database_ownercheck(MyDatabaseId, GetUserId()) && !onerel->rd_rel->relisshared)))
B
Bruce Momjian 已提交
196
	{
197 198
		/* No need for a WARNING if we already complained during VACUUM */
		if (!vacstmt->vacuum)
199
			ereport(WARNING,
200
					(errmsg("skipping \"%s\" --- only table or database owner can analyze it",
201
							RelationGetRelationName(onerel))));
202
		relation_close(onerel, AccessShareLock);
B
Bruce Momjian 已提交
203 204 205
		return;
	}

206
	/*
207 208
	 * Check that it's a plain table; we used to do this in getrels() but
	 * seems safer to check after we've locked the relation.
209
	 */
210
	if (onerel->rd_rel->relkind != RELKIND_RELATION)
B
Bruce Momjian 已提交
211
	{
B
Bruce Momjian 已提交
212
		/* No need for a WARNING if we already complained during VACUUM */
213
		if (!vacstmt->vacuum)
214
			ereport(WARNING,
215
					(errmsg("skipping \"%s\" --- cannot analyze indexes, views, or special system tables",
216
							RelationGetRelationName(onerel))));
217 218 219 220
		relation_close(onerel, AccessShareLock);
		return;
	}

221 222
	/*
	 * Silently ignore tables that are temp tables of other backends ---
B
Bruce Momjian 已提交
223 224 225
	 * trying to analyze these is rather pointless, since their contents
	 * are probably not up-to-date on disk.  (We don't throw a warning
	 * here; it would just lead to chatter during a database-wide
226 227 228 229 230 231 232 233
	 * ANALYZE.)
	 */
	if (isOtherTempNamespace(RelationGetNamespace(onerel)))
	{
		relation_close(onerel, AccessShareLock);
		return;
	}

234 235 236
	/*
	 * We can ANALYZE any table except pg_statistic. See update_attstats
	 */
237
	if (IsSystemNamespace(RelationGetNamespace(onerel)) &&
B
Bruce Momjian 已提交
238
	 strcmp(RelationGetRelationName(onerel), StatisticRelationName) == 0)
239 240
	{
		relation_close(onerel, AccessShareLock);
B
Bruce Momjian 已提交
241 242 243
		return;
	}

244 245 246 247
	ereport(elevel,
			(errmsg("analyzing \"%s.%s\"",
					get_namespace_name(RelationGetNamespace(onerel)),
					RelationGetRelationName(onerel))));
B
Bruce Momjian 已提交
248

249 250 251 252 253 254
	/*
	 * Determine which columns to analyze
	 *
	 * Note that system attributes are never analyzed.
	 */
	if (vacstmt->va_cols != NIL)
B
Bruce Momjian 已提交
255 256 257
	{
		List	   *le;

258 259 260 261
		vacattrstats = (VacAttrStats **) palloc(length(vacstmt->va_cols) *
												sizeof(VacAttrStats *));
		tcnt = 0;
		foreach(le, vacstmt->va_cols)
B
Bruce Momjian 已提交
262
		{
263
			char	   *col = strVal(lfirst(le));
B
Bruce Momjian 已提交
264

265 266
			i = attnameAttNum(onerel, col, false);
			vacattrstats[tcnt] = examine_attribute(onerel, i);
267 268 269 270 271 272 273
			if (vacattrstats[tcnt] != NULL)
				tcnt++;
		}
		attr_cnt = tcnt;
	}
	else
	{
274
		attr_cnt = onerel->rd_att->natts;
275 276
		/* +1 here is just to avoid palloc(0) with zero-column table */
		vacattrstats = (VacAttrStats **) palloc((attr_cnt + 1) *
277 278
												sizeof(VacAttrStats *));
		tcnt = 0;
279
		for (i = 1; i <= attr_cnt; i++)
280
		{
281
			vacattrstats[tcnt] = examine_attribute(onerel, i);
282 283
			if (vacattrstats[tcnt] != NULL)
				tcnt++;
B
Bruce Momjian 已提交
284 285 286 287
		}
		attr_cnt = tcnt;
	}

288 289 290 291 292
	/*
	 * Quit if no analyzable columns
	 */
	if (attr_cnt <= 0)
	{
293
		relation_close(onerel, AccessShareLock);
294 295
		return;
	}
B
Bruce Momjian 已提交
296

297 298
	/*
	 * Determine how many rows we need to sample, using the worst case
299 300
	 * from all analyzable columns.  We use a lower bound of 100 rows to
	 * avoid possible overflow in Vitter's algorithm.
301 302
	 */
	targrows = 100;
B
Bruce Momjian 已提交
303 304
	for (i = 0; i < attr_cnt; i++)
	{
305 306 307 308 309 310 311 312 313
		if (targrows < vacattrstats[i]->minrows)
			targrows = vacattrstats[i]->minrows;
	}

	/*
	 * Acquire the sample rows
	 */
	rows = (HeapTuple *) palloc(targrows * sizeof(HeapTuple));
	numrows = acquire_sample_rows(onerel, rows, targrows, &totalrows);
B
Bruce Momjian 已提交
314

315 316 317
	/*
	 * If we are running a standalone ANALYZE, update pages/tuples stats
	 * in pg_class.  We have the accurate page count from heap_beginscan,
318 319
	 * but only an approximate number of tuples; therefore, if we are part
	 * of VACUUM ANALYZE do *not* overwrite the accurate count already
320 321 322 323 324
	 * inserted by VACUUM.
	 */
	if (!vacstmt->vacuum)
		vac_update_relstats(RelationGetRelid(onerel),
							onerel->rd_nblocks,
325
							totalrows,
326 327 328
							RelationGetForm(onerel)->relhasindex);

	/*
329
	 * Compute the statistics.	Temporary results during the calculations
330 331
	 * for each column are stored in a child context.  The calc routines
	 * are responsible to make sure that whatever they store into the
332
	 * VacAttrStats structure is allocated in anl_context.
333 334 335 336 337 338
	 */
	if (numrows > 0)
	{
		MemoryContext col_context,
					old_context;

339
		col_context = AllocSetContextCreate(anl_context,
340 341 342 343 344 345
											"Analyze Column",
											ALLOCSET_DEFAULT_MINSIZE,
											ALLOCSET_DEFAULT_INITSIZE,
											ALLOCSET_DEFAULT_MAXSIZE);
		old_context = MemoryContextSwitchTo(col_context);
		for (i = 0; i < attr_cnt; i++)
B
Bruce Momjian 已提交
346
		{
347 348 349 350 351 352 353 354 355 356 357 358 359 360
			switch (vacattrstats[i]->algcode)
			{
				case ALG_MINIMAL:
					compute_minimal_stats(vacattrstats[i],
										  onerel->rd_att, totalrows,
										  rows, numrows);
					break;
				case ALG_SCALAR:
					compute_scalar_stats(vacattrstats[i],
										 onerel->rd_att, totalrows,
										 rows, numrows);
					break;
			}
			MemoryContextResetAndDeleteChildren(col_context);
B
Bruce Momjian 已提交
361
		}
362 363 364 365 366
		MemoryContextSwitchTo(old_context);
		MemoryContextDelete(col_context);

		/*
		 * Emit the completed stats rows into pg_statistic, replacing any
367 368 369
		 * previous statistics for the target columns.	(If there are
		 * stats in pg_statistic for columns we didn't process, we leave
		 * them alone.)
370 371 372 373 374 375 376 377 378
		 */
		update_attstats(relid, attr_cnt, vacattrstats);
	}

	/*
	 * Close source relation now, but keep lock so that no one deletes it
	 * before we commit.  (If someone did, they'd fail to clean up the
	 * entries we made in pg_statistic.)
	 */
379
	relation_close(onerel, NoLock);
380 381 382 383 384 385 386 387 388 389 390
}

/*
 * examine_attribute -- pre-analysis of a single column
 *
 * Determine whether the column is analyzable; if so, create and initialize
 * a VacAttrStats struct for it.  If not, return NULL.
 */
static VacAttrStats *
examine_attribute(Relation onerel, int attnum)
{
391
	Form_pg_attribute attr = onerel->rd_att->attrs[attnum - 1];
392 393 394 395 396 397 398
	Operator	func_operator;
	HeapTuple	typtuple;
	Oid			eqopr = InvalidOid;
	Oid			eqfunc = InvalidOid;
	Oid			ltopr = InvalidOid;
	VacAttrStats *stats;

399 400 401 402
	/* Don't analyze dropped columns */
	if (attr->attisdropped)
		return NULL;

403
	/* Don't analyze column if user has specified not to */
404
	if (attr->attstattarget == 0)
405 406 407
		return NULL;

	/* If column has no "=" operator, we can't do much of anything */
408
	func_operator = equality_oper(attr->atttypid, true);
409 410
	if (func_operator != NULL)
	{
411 412
		eqopr = oprid(func_operator);
		eqfunc = oprfuncid(func_operator);
413 414 415 416
		ReleaseSysCache(func_operator);
	}
	if (!OidIsValid(eqfunc))
		return NULL;
B
Bruce Momjian 已提交
417

418
	/*
419 420
	 * If we have "=" then we're at least able to do the minimal
	 * algorithm, so start filling in a VacAttrStats struct.
421
	 */
422
	stats = (VacAttrStats *) palloc0(sizeof(VacAttrStats));
423 424 425 426 427 428 429
	stats->attnum = attnum;
	stats->attr = (Form_pg_attribute) palloc(ATTRIBUTE_TUPLE_SIZE);
	memcpy(stats->attr, attr, ATTRIBUTE_TUPLE_SIZE);
	typtuple = SearchSysCache(TYPEOID,
							  ObjectIdGetDatum(attr->atttypid),
							  0, 0, 0);
	if (!HeapTupleIsValid(typtuple))
430
		elog(ERROR, "cache lookup failed for type %u", attr->atttypid);
431 432 433 434 435 436
	stats->attrtype = (Form_pg_type) palloc(sizeof(FormData_pg_type));
	memcpy(stats->attrtype, GETSTRUCT(typtuple), sizeof(FormData_pg_type));
	ReleaseSysCache(typtuple);
	stats->eqopr = eqopr;
	stats->eqfunc = eqfunc;

437 438 439 440
	/* If the attstattarget column is negative, use the default value */
	if (stats->attr->attstattarget < 0)
		stats->attr->attstattarget = default_statistics_target;

441
	/* Is there a "<" operator with suitable semantics? */
442
	func_operator = ordering_oper(attr->atttypid, true);
443 444
	if (func_operator != NULL)
	{
445
		ltopr = oprid(func_operator);
446 447 448 449 450
		ReleaseSysCache(func_operator);
	}
	stats->ltopr = ltopr;

	/*
451 452
	 * Determine the algorithm to use (this will get more complicated
	 * later)
453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476
	 */
	if (OidIsValid(ltopr))
	{
		/* Seems to be a scalar datatype */
		stats->algcode = ALG_SCALAR;
		/*--------------------
		 * The following choice of minrows is based on the paper
		 * "Random sampling for histogram construction: how much is enough?"
		 * by Surajit Chaudhuri, Rajeev Motwani and Vivek Narasayya, in
		 * Proceedings of ACM SIGMOD International Conference on Management
		 * of Data, 1998, Pages 436-447.  Their Corollary 1 to Theorem 5
		 * says that for table size n, histogram size k, maximum relative
		 * error in bin size f, and error probability gamma, the minimum
		 * random sample size is
		 *		r = 4 * k * ln(2*n/gamma) / f^2
		 * Taking f = 0.5, gamma = 0.01, n = 1 million rows, we obtain
		 *		r = 305.82 * k
		 * Note that because of the log function, the dependence on n is
		 * quite weak; even at n = 1 billion, a 300*k sample gives <= 0.59
		 * bin size error with probability 0.99.  So there's no real need to
		 * scale for n, which is a good thing because we don't necessarily
		 * know it at this point.
		 *--------------------
		 */
477
		stats->minrows = 300 * stats->attr->attstattarget;
478 479 480 481 482 483
	}
	else
	{
		/* Can't do much but the minimal stuff */
		stats->algcode = ALG_MINIMAL;
		/* Might as well use the same minrows as above */
484
		stats->minrows = 300 * stats->attr->attstattarget;
485 486 487 488
	}

	return stats;
}
B
Bruce Momjian 已提交
489

490 491 492 493
/*
 * acquire_sample_rows -- acquire a random sample of rows from the table
 *
 * Up to targrows rows are collected (if there are fewer than that many
494
 * rows in the table, all rows are collected).	When the table is larger
495 496 497 498 499 500 501 502 503 504 505
 * than targrows, a truly random sample is collected: every row has an
 * equal chance of ending up in the final sample.
 *
 * We also estimate the total number of rows in the table, and return that
 * into *totalrows.
 *
 * The returned list of tuples is in order by physical position in the table.
 * (We will rely on this later to derive correlation estimates.)
 */
static int
acquire_sample_rows(Relation onerel, HeapTuple *rows, int targrows,
506
					double *totalrows)
507 508 509 510
{
	int			numrows = 0;
	HeapScanDesc scan;
	HeapTuple	tuple;
511 512
	ItemPointer lasttuple;
	BlockNumber lastblock,
513 514 515 516
				estblock;
	OffsetNumber lastoffset;
	int			numest;
	double		tuplesperpage;
517
	double		t;
518 519 520
	double		rstate;

	Assert(targrows > 1);
521

522 523 524
	/*
	 * Do a simple linear scan until we reach the target number of rows.
	 */
525 526
	scan = heap_beginscan(onerel, SnapshotNow, 0, NULL);
	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
527 528 529 530
	{
		rows[numrows++] = heap_copytuple(tuple);
		if (numrows >= targrows)
			break;
531
		CHECK_FOR_INTERRUPTS();
532 533
	}
	heap_endscan(scan);
534

535
	/*
536
	 * If we ran out of tuples then we're done, no matter how few we
537 538 539 540
	 * collected.  No sort is needed, since they're already in order.
	 */
	if (!HeapTupleIsValid(tuple))
	{
541
		*totalrows = (double) numrows;
542 543 544 545 546 547

		ereport(elevel,
				(errmsg("\"%s\": %u pages, %d rows sampled, %.0f estimated total rows",
						RelationGetRelationName(onerel),
						onerel->rd_nblocks, numrows, *totalrows)));

548 549
		return numrows;
	}
550

551 552 553
	/*
	 * Otherwise, start replacing tuples in the sample until we reach the
	 * end of the relation.  This algorithm is from Jeff Vitter's paper
554 555 556 557 558 559
	 * (see full citation below).  It works by repeatedly computing the
	 * number of the next tuple we want to fetch, which will replace a
	 * randomly chosen element of the reservoir (current set of tuples).
	 * At all times the reservoir is a true random sample of the tuples
	 * we've passed over so far, so when we fall off the end of the
	 * relation we're done.
560
	 *
561 562 563 564 565 566 567 568 569 570 571
	 * A slight difficulty is that since we don't want to fetch tuples or
	 * even pages that we skip over, it's not possible to fetch *exactly*
	 * the N'th tuple at each step --- we don't know how many valid tuples
	 * are on the skipped pages.  We handle this by assuming that the
	 * average number of valid tuples/page on the pages already scanned
	 * over holds good for the rest of the relation as well; this lets us
	 * estimate which page the next tuple should be on and its position in
	 * the page.  Then we fetch the first valid tuple at or after that
	 * position, being careful not to use the same tuple twice.  This
	 * approach should still give a good random sample, although it's not
	 * perfect.
572
	 */
573
	lasttuple = &(rows[numrows - 1]->t_self);
574 575
	lastblock = ItemPointerGetBlockNumber(lasttuple);
	lastoffset = ItemPointerGetOffsetNumber(lasttuple);
576

577
	/*
578 579
	 * If possible, estimate tuples/page using only completely-scanned
	 * pages.
580 581 582
	 */
	for (numest = numrows; numest > 0; numest--)
	{
583
		if (ItemPointerGetBlockNumber(&(rows[numest - 1]->t_self)) != lastblock)
584 585 586 587 588 589 590 591 592 593 594
			break;
	}
	if (numest == 0)
	{
		numest = numrows;		/* don't have a full page? */
		estblock = lastblock + 1;
	}
	else
		estblock = lastblock;
	tuplesperpage = (double) numest / (double) estblock;

595
	t = (double) numrows;		/* t is the # of records processed so far */
596 597 598
	rstate = init_selection_state(targrows);
	for (;;)
	{
599 600 601 602 603 604
		double		targpos;
		BlockNumber targblock;
		Buffer		targbuffer;
		Page		targpage;
		OffsetNumber targoffset,
					maxoffset;
605

606 607
		CHECK_FOR_INTERRUPTS();

608 609
		t = select_next_random_record(t, targrows, &rstate);
		/* Try to read the t'th record in the table */
610
		targpos = t / tuplesperpage;
611
		targblock = (BlockNumber) targpos;
612
		targoffset = ((int) ((targpos - targblock) * tuplesperpage)) +
613 614 615
			FirstOffsetNumber;
		/* Make sure we are past the last selected record */
		if (targblock <= lastblock)
B
Bruce Momjian 已提交
616
		{
617 618 619
			targblock = lastblock;
			if (targoffset <= lastoffset)
				targoffset = lastoffset + 1;
B
Bruce Momjian 已提交
620
		}
621
		/* Loop to find first valid record at or after given position */
622 623
pageloop:;

624
		/*
625
		 * Have we fallen off the end of the relation?	(We rely on
626 627 628 629
		 * heap_beginscan to have updated rd_nblocks.)
		 */
		if (targblock >= onerel->rd_nblocks)
			break;
630

631
		/*
632 633 634 635 636 637
		 * We must maintain a pin on the target page's buffer to ensure
		 * that the maxoffset value stays good (else concurrent VACUUM
		 * might delete tuples out from under us).	Hence, pin the page
		 * until we are done looking at it.  We don't maintain a lock on
		 * the page, so tuples could get added to it, but we ignore such
		 * tuples.
638 639 640
		 */
		targbuffer = ReadBuffer(onerel, targblock);
		if (!BufferIsValid(targbuffer))
641
			elog(ERROR, "ReadBuffer failed");
642 643 644 645 646
		LockBuffer(targbuffer, BUFFER_LOCK_SHARE);
		targpage = BufferGetPage(targbuffer);
		maxoffset = PageGetMaxOffsetNumber(targpage);
		LockBuffer(targbuffer, BUFFER_LOCK_UNLOCK);

647
		for (;;)
B
Bruce Momjian 已提交
648
		{
649
			HeapTupleData targtuple;
650
			Buffer		tupbuffer;
651 652 653 654

			if (targoffset > maxoffset)
			{
				/* Fell off end of this page, try next */
655
				ReleaseBuffer(targbuffer);
656 657 658 659 660
				targblock++;
				targoffset = FirstOffsetNumber;
				goto pageloop;
			}
			ItemPointerSet(&targtuple.t_self, targblock, targoffset);
661 662
			if (heap_fetch(onerel, SnapshotNow, &targtuple, &tupbuffer,
						   false, NULL))
663 664 665 666 667
			{
				/*
				 * Found a suitable tuple, so save it, replacing one old
				 * tuple at random
				 */
668
				int			k = (int) (targrows * random_fract());
669 670 671 672

				Assert(k >= 0 && k < targrows);
				heap_freetuple(rows[k]);
				rows[k] = heap_copytuple(&targtuple);
673 674 675
				/* this releases the second pin acquired by heap_fetch: */
				ReleaseBuffer(tupbuffer);
				/* this releases the initial pin: */
676 677 678 679 680 681 682
				ReleaseBuffer(targbuffer);
				lastblock = targblock;
				lastoffset = targoffset;
				break;
			}
			/* this tuple is dead, so advance to next one on same page */
			targoffset++;
B
Bruce Momjian 已提交
683 684 685
		}
	}

686 687 688 689
	/*
	 * Now we need to sort the collected tuples by position (itempointer).
	 */
	qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows);
B
Bruce Momjian 已提交
690

691 692 693
	/*
	 * Estimate total number of valid rows in relation.
	 */
694
	*totalrows = floor((double) onerel->rd_nblocks * tuplesperpage + 0.5);
B
Bruce Momjian 已提交
695

696 697 698
	/*
	 * Emit some interesting relation info 
	 */
699
	ereport(elevel,
700
			(errmsg("\"%s\": %u pages, %d rows sampled, %.0f estimated total rows",
701
					RelationGetRelationName(onerel),
702
					onerel->rd_nblocks, numrows, *totalrows)));
703

704 705
	return numrows;
}
B
Bruce Momjian 已提交
706

707 708 709 710
/* Select a random value R uniformly distributed in 0 < R < 1 */
static double
random_fract(void)
{
711
	long		z;
B
Bruce Momjian 已提交
712

713 714 715 716
	/* random() can produce endpoint values, try again if so */
	do
	{
		z = random();
717
	} while (!(z > 0 && z < MAX_RANDOM_VALUE));
718
	return (double) z / (double) MAX_RANDOM_VALUE;
B
Bruce Momjian 已提交
719 720 721
}

/*
722 723 724 725 726 727 728
 * These two routines embody Algorithm Z from "Random sampling with a
 * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
 * (Mar. 1985), Pages 37-57.  While Vitter describes his algorithm in terms
 * of the count S of records to skip before processing another record,
 * it is convenient to work primarily with t, the index (counting from 1)
 * of the last record processed and next record to process.  The only extra
 * state needed between calls is W, a random state variable.
B
Bruce Momjian 已提交
729
 *
730 731 732 733 734 735
 * Note: the original algorithm defines t, S, numer, and denom as integers.
 * Here we express them as doubles to avoid overflow if the number of rows
 * in the table exceeds INT_MAX.  The algorithm should work as long as the
 * row count does not become so large that it is not represented accurately
 * in a double (on IEEE-math machines this would be around 2^52 rows).
 *
736
 * init_selection_state computes the initial W value.
B
Bruce Momjian 已提交
737
 *
738 739 740 741 742 743 744 745
 * Given that we've already processed t records (t >= n),
 * select_next_random_record determines the number of the next record to
 * process.
 */
static double
init_selection_state(int n)
{
	/* Initial value of W (for use when Algorithm Z is first applied) */
746
	return exp(-log(random_fract()) / n);
747 748
}

749 750
static double
select_next_random_record(double t, int n, double *stateptr)
751 752
{
	/* The magic constant here is T from Vitter's paper */
753
	if (t <= (22.0 * n))
754 755
	{
		/* Process records using Algorithm X until t is large enough */
756 757
		double		V,
					quot;
758 759

		V = random_fract();		/* Generate V */
760 761
		t += 1;
		quot = (t - (double) n) / t;
762 763 764
		/* Find min S satisfying (4.1) */
		while (quot > V)
		{
765 766
			t += 1;
			quot *= (t - (double) n) / t;
767 768 769 770 771
		}
	}
	else
	{
		/* Now apply Algorithm Z */
772 773 774
		double		W = *stateptr;
		double		term = t - (double) n + 1;
		double		S;
775 776 777

		for (;;)
		{
778 779 780 781 782 783 784 785 786
			double		numer,
						numer_lim,
						denom;
			double		U,
						X,
						lhs,
						rhs,
						y,
						tmp;
787 788 789 790

			/* Generate U and X */
			U = random_fract();
			X = t * (W - 1.0);
791
			S = floor(X);		/* S is tentatively set to floor(X) */
792
			/* Test if U <= h(S)/cg(X) in the manner of (6.3) */
793
			tmp = (t + 1) / term;
794 795
			lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
			rhs = (((t + X) / (term + S)) * term) / t;
796 797
			if (lhs <= rhs)
			{
798
				W = rhs / lhs;
799 800 801
				break;
			}
			/* Test if U <= f(S)/cg(X) */
802
			y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
803
			if ((double) n < S)
804 805 806 807 808 809
			{
				denom = t;
				numer_lim = term + S;
			}
			else
			{
810
				denom = t - (double) n + S;
811 812
				numer_lim = t + 1;
			}
813
			for (numer = t + S; numer >= numer_lim; numer -= 1)
814
			{
815 816
				y *= numer / denom;
				denom -= 1;
817
			}
818 819
			W = exp(-log(random_fract()) / n);	/* Generate W in advance */
			if (exp(log(y) / n) <= (t + X) / t)
820 821 822 823 824 825 826 827 828 829 830 831 832 833
				break;
		}
		t += S + 1;
		*stateptr = W;
	}
	return t;
}

/*
 * qsort comparator for sorting rows[] array
 */
static int
compare_rows(const void *a, const void *b)
{
834 835 836
	HeapTuple	ha = *(HeapTuple *) a;
	HeapTuple	hb = *(HeapTuple *) b;
	BlockNumber ba = ItemPointerGetBlockNumber(&ha->t_self);
837
	OffsetNumber oa = ItemPointerGetOffsetNumber(&ha->t_self);
838
	BlockNumber bb = ItemPointerGetBlockNumber(&hb->t_self);
839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854
	OffsetNumber ob = ItemPointerGetOffsetNumber(&hb->t_self);

	if (ba < bb)
		return -1;
	if (ba > bb)
		return 1;
	if (oa < ob)
		return -1;
	if (oa > ob)
		return 1;
	return 0;
}


/*
 *	compute_minimal_stats() -- compute minimal column statistics
B
Bruce Momjian 已提交
855
 *
856
 *	We use this when we can find only an "=" operator for the datatype.
B
Bruce Momjian 已提交
857
 *
858 859
 *	We determine the fraction of non-null rows, the average width, the
 *	most common values, and the (estimated) number of distinct values.
B
Bruce Momjian 已提交
860
 *
861 862 863 864 865 866
 *	The most common values are determined by brute force: we keep a list
 *	of previously seen values, ordered by number of times seen, as we scan
 *	the samples.  A newly seen value is inserted just after the last
 *	multiply-seen value, causing the bottommost (oldest) singly-seen value
 *	to drop off the list.  The accuracy of this method, and also its cost,
 *	depend mainly on the length of the list we are willing to keep.
B
Bruce Momjian 已提交
867 868
 */
static void
869
compute_minimal_stats(VacAttrStats *stats,
870
					  TupleDesc tupDesc, double totalrows,
871
					  HeapTuple *rows, int numrows)
B
Bruce Momjian 已提交
872 873
{
	int			i;
874 875 876 877 878 879
	int			null_cnt = 0;
	int			nonnull_cnt = 0;
	int			toowide_cnt = 0;
	double		total_width = 0;
	bool		is_varlena = (!stats->attr->attbyval &&
							  stats->attr->attlen == -1);
880 881
	bool		is_varwidth = (!stats->attr->attbyval &&
							   stats->attr->attlen < 0);
882 883 884
	FmgrInfo	f_cmpeq;
	typedef struct
	{
885 886
		Datum		value;
		int			count;
887 888 889 890 891 892
	} TrackItem;
	TrackItem  *track;
	int			track_cnt,
				track_max;
	int			num_mcv = stats->attr->attstattarget;

893 894 895 896
	/*
	 * We track up to 2*n values for an n-element MCV list; but at least
	 * 10
	 */
897 898 899 900 901 902 903 904 905
	track_max = 2 * num_mcv;
	if (track_max < 10)
		track_max = 10;
	track = (TrackItem *) palloc(track_max * sizeof(TrackItem));
	track_cnt = 0;

	fmgr_info(stats->eqfunc, &f_cmpeq);

	for (i = 0; i < numrows; i++)
B
Bruce Momjian 已提交
906
	{
907
		HeapTuple	tuple = rows[i];
908 909
		Datum		value;
		bool		isnull;
910 911 912
		bool		match;
		int			firstcount1,
					j;
913

914 915
		CHECK_FOR_INTERRUPTS();

916
		value = heap_getattr(tuple, stats->attnum, tupDesc, &isnull);
B
Bruce Momjian 已提交
917

918
		/* Check for null/nonnull */
B
Bruce Momjian 已提交
919
		if (isnull)
920
		{
921
			null_cnt++;
922 923
			continue;
		}
924
		nonnull_cnt++;
925 926

		/*
927
		 * If it's a variable-width field, add up widths for average width
928 929 930
		 * calculation.  Note that if the value is toasted, we use the
		 * toasted width.  We don't bother with this calculation if it's a
		 * fixed-width type.
931
		 */
932
		if (is_varlena)
B
Bruce Momjian 已提交
933
		{
934
			total_width += VARSIZE(DatumGetPointer(value));
935

936 937
			/*
			 * If the value is toasted, we want to detoast it just once to
938 939 940 941
			 * avoid repeated detoastings and resultant excess memory
			 * usage during the comparisons.  Also, check to see if the
			 * value is excessively wide, and if so don't detoast at all
			 * --- just ignore the value.
942 943
			 */
			if (toast_raw_datum_size(value) > WIDTH_THRESHOLD)
B
Bruce Momjian 已提交
944
			{
945 946
				toowide_cnt++;
				continue;
B
Bruce Momjian 已提交
947
			}
948
			value = PointerGetDatum(PG_DETOAST_DATUM(value));
949
		}
950 951 952 953 954
		else if (is_varwidth)
		{
			/* must be cstring */
			total_width += strlen(DatumGetCString(value)) + 1;
		}
B
Bruce Momjian 已提交
955

956 957 958 959 960 961
		/*
		 * See if the value matches anything we're already tracking.
		 */
		match = false;
		firstcount1 = track_cnt;
		for (j = 0; j < track_cnt; j++)
962
		{
963
			if (DatumGetBool(FunctionCall2(&f_cmpeq, value, track[j].value)))
B
Bruce Momjian 已提交
964
			{
965 966
				match = true;
				break;
B
Bruce Momjian 已提交
967
			}
968 969 970
			if (j < firstcount1 && track[j].count == 1)
				firstcount1 = j;
		}
971

972 973 974 975 976
		if (match)
		{
			/* Found a match */
			track[j].count++;
			/* This value may now need to "bubble up" in the track list */
977
			while (j > 0 && track[j].count > track[j - 1].count)
B
Bruce Momjian 已提交
978
			{
979 980
				swapDatum(track[j].value, track[j - 1].value);
				swapInt(track[j].count, track[j - 1].count);
981
				j--;
B
Bruce Momjian 已提交
982
			}
983
		}
984
		else
985
		{
986 987 988
			/* No match.  Insert at head of count-1 list */
			if (track_cnt < track_max)
				track_cnt++;
989
			for (j = track_cnt - 1; j > firstcount1; j--)
990
			{
991 992
				track[j].value = track[j - 1].value;
				track[j].count = track[j - 1].count;
993 994 995 996 997 998
			}
			if (firstcount1 < track_cnt)
			{
				track[firstcount1].value = value;
				track[firstcount1].count = 1;
			}
999
		}
1000 1001 1002 1003 1004
	}

	/* We can only compute valid stats if we found some non-null values. */
	if (nonnull_cnt > 0)
	{
1005 1006
		int			nmultiple,
					summultiple;
1007 1008 1009 1010

		stats->stats_valid = true;
		/* Do the simple null-frac and width stats */
		stats->stanullfrac = (double) null_cnt / (double) numrows;
1011
		if (is_varwidth)
1012
			stats->stawidth = total_width / (double) nonnull_cnt;
1013
		else
1014
			stats->stawidth = stats->attrtype->typlen;
1015

1016 1017 1018
		/* Count the number of values we found multiple times */
		summultiple = 0;
		for (nmultiple = 0; nmultiple < track_cnt; nmultiple++)
1019
		{
1020 1021 1022
			if (track[nmultiple].count == 1)
				break;
			summultiple += track[nmultiple].count;
1023
		}
1024 1025

		if (nmultiple == 0)
1026
		{
1027 1028
			/* If we found no repeated values, assume it's a unique column */
			stats->stadistinct = -1.0;
1029
		}
1030 1031
		else if (track_cnt < track_max && toowide_cnt == 0 &&
				 nmultiple == track_cnt)
1032
		{
1033
			/*
1034 1035 1036
			 * Our track list includes every value in the sample, and
			 * every value appeared more than once.  Assume the column has
			 * just these values.
1037 1038
			 */
			stats->stadistinct = track_cnt;
B
Bruce Momjian 已提交
1039
		}
1040 1041 1042 1043
		else
		{
			/*----------
			 * Estimate the number of distinct values using the estimator
1044 1045 1046 1047 1048 1049 1050 1051 1052
			 * proposed by Haas and Stokes in IBM Research Report RJ 10025:
			 *		n*d / (n - f1 + f1*n/N)
			 * where f1 is the number of distinct values that occurred
			 * exactly once in our sample of n rows (from a total of N),
			 * and d is the total number of distinct values in the sample.
			 * This is their Duj1 estimator; the other estimators they
			 * recommend are considerably more complex, and are numerically
			 * very unstable when n is much smaller than N.
			 *
1053 1054
			 * We assume (not very reliably!) that all the multiply-occurring
			 * values are reflected in the final track[] list, and the other
1055
			 * nonnull values all appeared but once.  (XXX this usually
1056
			 * results in a drastic overestimate of ndistinct.	Can we do
1057
			 * any better?)
1058 1059
			 *----------
			 */
1060
			int			f1 = nonnull_cnt - summultiple;
1061
			int			d = f1 + nmultiple;
B
Bruce Momjian 已提交
1062 1063 1064 1065 1066
			double		numer,
						denom,
						stadistinct;

			numer = (double) numrows *(double) d;
1067 1068

			denom = (double) (numrows - f1) +
B
Bruce Momjian 已提交
1069 1070
				(double) f1 *(double) numrows / totalrows;

1071 1072 1073 1074 1075 1076 1077
			stadistinct = numer / denom;
			/* Clamp to sane range in case of roundoff error */
			if (stadistinct < (double) d)
				stadistinct = (double) d;
			if (stadistinct > totalrows)
				stadistinct = totalrows;
			stats->stadistinct = floor(stadistinct + 0.5);
1078
		}
B
Bruce Momjian 已提交
1079

1080 1081 1082 1083 1084 1085 1086
		/*
		 * If we estimated the number of distinct values at more than 10%
		 * of the total row count (a very arbitrary limit), then assume
		 * that stadistinct should scale with the row count rather than be
		 * a fixed value.
		 */
		if (stats->stadistinct > 0.1 * totalrows)
1087
			stats->stadistinct = -(stats->stadistinct / totalrows);
B
Bruce Momjian 已提交
1088

1089 1090 1091 1092
		/*
		 * Decide how many values are worth storing as most-common values.
		 * If we are able to generate a complete MCV list (all the values
		 * in the sample will fit, and we think these are all the ones in
1093 1094 1095 1096
		 * the table), then do so.	Otherwise, store only those values
		 * that are significantly more common than the (estimated)
		 * average. We set the threshold rather arbitrarily at 25% more
		 * than average, with at least 2 instances in the sample.
1097 1098 1099 1100 1101 1102 1103 1104 1105 1106
		 */
		if (track_cnt < track_max && toowide_cnt == 0 &&
			stats->stadistinct > 0 &&
			track_cnt <= num_mcv)
		{
			/* Track list includes all values seen, and all will fit */
			num_mcv = track_cnt;
		}
		else
		{
1107 1108 1109
			double		ndistinct = stats->stadistinct;
			double		avgcount,
						mincount;
1110 1111

			if (ndistinct < 0)
1112
				ndistinct = -ndistinct * totalrows;
1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131
			/* estimate # of occurrences in sample of a typical value */
			avgcount = (double) numrows / ndistinct;
			/* set minimum threshold count to store a value */
			mincount = avgcount * 1.25;
			if (mincount < 2)
				mincount = 2;
			if (num_mcv > track_cnt)
				num_mcv = track_cnt;
			for (i = 0; i < num_mcv; i++)
			{
				if (track[i].count < mincount)
				{
					num_mcv = i;
					break;
				}
			}
		}

		/* Generate MCV slot entry */
1132
		if (num_mcv > 0)
B
Bruce Momjian 已提交
1133
		{
1134
			MemoryContext old_context;
1135 1136
			Datum	   *mcv_values;
			float4	   *mcv_freqs;
1137

1138 1139
			/* Must copy the target values into anl_context */
			old_context = MemoryContextSwitchTo(anl_context);
1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156
			mcv_values = (Datum *) palloc(num_mcv * sizeof(Datum));
			mcv_freqs = (float4 *) palloc(num_mcv * sizeof(float4));
			for (i = 0; i < num_mcv; i++)
			{
				mcv_values[i] = datumCopy(track[i].value,
										  stats->attr->attbyval,
										  stats->attr->attlen);
				mcv_freqs[i] = (double) track[i].count / (double) numrows;
			}
			MemoryContextSwitchTo(old_context);

			stats->stakind[0] = STATISTIC_KIND_MCV;
			stats->staop[0] = stats->eqopr;
			stats->stanumbers[0] = mcv_freqs;
			stats->numnumbers[0] = num_mcv;
			stats->stavalues[0] = mcv_values;
			stats->numvalues[0] = num_mcv;
B
Bruce Momjian 已提交
1157 1158
		}
	}
1159 1160

	/* We don't need to bother cleaning up any of our temporary palloc's */
B
Bruce Momjian 已提交
1161 1162 1163 1164
}


/*
1165
 *	compute_scalar_stats() -- compute column statistics
B
Bruce Momjian 已提交
1166
 *
1167
 *	We use this when we can find "=" and "<" operators for the datatype.
1168
 *
1169 1170 1171
 *	We determine the fraction of non-null rows, the average width, the
 *	most common values, the (estimated) number of distinct values, the
 *	distribution histogram, and the correlation of physical to logical order.
B
Bruce Momjian 已提交
1172
 *
1173 1174
 *	The desired stats can be determined fairly easily after sorting the
 *	data values into order.
B
Bruce Momjian 已提交
1175 1176
 */
static void
1177
compute_scalar_stats(VacAttrStats *stats,
1178
					 TupleDesc tupDesc, double totalrows,
1179
					 HeapTuple *rows, int numrows)
B
Bruce Momjian 已提交
1180
{
1181 1182 1183 1184 1185 1186 1187
	int			i;
	int			null_cnt = 0;
	int			nonnull_cnt = 0;
	int			toowide_cnt = 0;
	double		total_width = 0;
	bool		is_varlena = (!stats->attr->attbyval &&
							  stats->attr->attlen == -1);
1188 1189
	bool		is_varwidth = (!stats->attr->attbyval &&
							   stats->attr->attlen < 0);
1190 1191 1192 1193 1194 1195 1196 1197 1198 1199
	double		corr_xysum;
	RegProcedure cmpFn;
	SortFunctionKind cmpFnKind;
	FmgrInfo	f_cmpfn;
	ScalarItem *values;
	int			values_cnt = 0;
	int		   *tupnoLink;
	ScalarMCVItem *track;
	int			track_cnt = 0;
	int			num_mcv = stats->attr->attstattarget;
1200
	int			num_bins = stats->attr->attstattarget;
1201 1202 1203 1204 1205 1206 1207 1208 1209 1210

	values = (ScalarItem *) palloc(numrows * sizeof(ScalarItem));
	tupnoLink = (int *) palloc(numrows * sizeof(int));
	track = (ScalarMCVItem *) palloc(num_mcv * sizeof(ScalarMCVItem));

	SelectSortFunction(stats->ltopr, &cmpFn, &cmpFnKind);
	fmgr_info(cmpFn, &f_cmpfn);

	/* Initial scan to find sortable values */
	for (i = 0; i < numrows; i++)
B
Bruce Momjian 已提交
1211
	{
1212 1213 1214
		HeapTuple	tuple = rows[i];
		Datum		value;
		bool		isnull;
B
Bruce Momjian 已提交
1215

1216 1217
		CHECK_FOR_INTERRUPTS();

1218
		value = heap_getattr(tuple, stats->attnum, tupDesc, &isnull);
B
Bruce Momjian 已提交
1219

1220 1221
		/* Check for null/nonnull */
		if (isnull)
B
Bruce Momjian 已提交
1222
		{
1223 1224
			null_cnt++;
			continue;
B
Bruce Momjian 已提交
1225
		}
1226
		nonnull_cnt++;
B
Bruce Momjian 已提交
1227

1228
		/*
1229
		 * If it's a variable-width field, add up widths for average width
1230 1231 1232
		 * calculation.  Note that if the value is toasted, we use the
		 * toasted width.  We don't bother with this calculation if it's a
		 * fixed-width type.
1233 1234
		 */
		if (is_varlena)
B
Bruce Momjian 已提交
1235
		{
1236
			total_width += VARSIZE(DatumGetPointer(value));
1237

1238 1239
			/*
			 * If the value is toasted, we want to detoast it just once to
1240 1241 1242 1243
			 * avoid repeated detoastings and resultant excess memory
			 * usage during the comparisons.  Also, check to see if the
			 * value is excessively wide, and if so don't detoast at all
			 * --- just ignore the value.
1244 1245
			 */
			if (toast_raw_datum_size(value) > WIDTH_THRESHOLD)
B
Bruce Momjian 已提交
1246
			{
1247 1248
				toowide_cnt++;
				continue;
B
Bruce Momjian 已提交
1249
			}
1250 1251
			value = PointerGetDatum(PG_DETOAST_DATUM(value));
		}
1252 1253 1254 1255 1256
		else if (is_varwidth)
		{
			/* must be cstring */
			total_width += strlen(DatumGetCString(value)) + 1;
		}
B
Bruce Momjian 已提交
1257

1258 1259 1260 1261 1262 1263 1264 1265 1266 1267
		/* Add it to the list to be sorted */
		values[values_cnt].value = value;
		values[values_cnt].tupno = values_cnt;
		tupnoLink[values_cnt] = values_cnt;
		values_cnt++;
	}

	/* We can only compute valid stats if we found some sortable values. */
	if (values_cnt > 0)
	{
1268 1269 1270 1271 1272
		int			ndistinct,	/* # distinct values in sample */
					nmultiple,	/* # that appear multiple times */
					num_hist,
					dups_cnt;
		int			slot_idx = 0;
1273 1274 1275 1276 1277 1278 1279 1280 1281

		/* Sort the collected values */
		datumCmpFn = &f_cmpfn;
		datumCmpFnKind = cmpFnKind;
		datumCmpTupnoLink = tupnoLink;
		qsort((void *) values, values_cnt,
			  sizeof(ScalarItem), compare_scalars);

		/*
1282 1283
		 * Now scan the values in order, find the most common ones, and
		 * also accumulate ordering-correlation statistics.
1284 1285
		 *
		 * To determine which are most common, we first have to count the
1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299
		 * number of duplicates of each value.	The duplicates are
		 * adjacent in the sorted list, so a brute-force approach is to
		 * compare successive datum values until we find two that are not
		 * equal. However, that requires N-1 invocations of the datum
		 * comparison routine, which are completely redundant with work
		 * that was done during the sort.  (The sort algorithm must at
		 * some point have compared each pair of items that are adjacent
		 * in the sorted order; otherwise it could not know that it's
		 * ordered the pair correctly.) We exploit this by having
		 * compare_scalars remember the highest tupno index that each
		 * ScalarItem has been found equal to.	At the end of the sort, a
		 * ScalarItem's tupnoLink will still point to itself if and only
		 * if it is the last item of its group of duplicates (since the
		 * group will be ordered by tupno).
1300 1301 1302 1303 1304 1305 1306 1307 1308
		 */
		corr_xysum = 0;
		ndistinct = 0;
		nmultiple = 0;
		dups_cnt = 0;
		for (i = 0; i < values_cnt; i++)
		{
			int			tupno = values[i].tupno;

1309
			corr_xysum += ((double) i) * ((double) tupno);
1310 1311
			dups_cnt++;
			if (tupnoLink[tupno] == tupno)
B
Bruce Momjian 已提交
1312
			{
1313 1314 1315
				/* Reached end of duplicates of this value */
				ndistinct++;
				if (dups_cnt > 1)
B
Bruce Momjian 已提交
1316
				{
1317 1318
					nmultiple++;
					if (track_cnt < num_mcv ||
1319
						dups_cnt > track[track_cnt - 1].count)
1320 1321 1322 1323 1324 1325 1326
					{
						/*
						 * Found a new item for the mcv list; find its
						 * position, bubbling down old items if needed.
						 * Loop invariant is that j points at an empty/
						 * replaceable slot.
						 */
1327
						int			j;
1328 1329 1330

						if (track_cnt < num_mcv)
							track_cnt++;
1331
						for (j = track_cnt - 1; j > 0; j--)
1332
						{
1333
							if (dups_cnt <= track[j - 1].count)
1334
								break;
1335 1336
							track[j].count = track[j - 1].count;
							track[j].first = track[j - 1].first;
1337 1338 1339 1340 1341 1342 1343 1344
						}
						track[j].count = dups_cnt;
						track[j].first = i + 1 - dups_cnt;
					}
				}
				dups_cnt = 0;
			}
		}
B
Bruce Momjian 已提交
1345

1346 1347 1348
		stats->stats_valid = true;
		/* Do the simple null-frac and width stats */
		stats->stanullfrac = (double) null_cnt / (double) numrows;
1349
		if (is_varwidth)
1350 1351 1352
			stats->stawidth = total_width / (double) nonnull_cnt;
		else
			stats->stawidth = stats->attrtype->typlen;
B
Bruce Momjian 已提交
1353

1354 1355 1356 1357 1358 1359 1360 1361
		if (nmultiple == 0)
		{
			/* If we found no repeated values, assume it's a unique column */
			stats->stadistinct = -1.0;
		}
		else if (toowide_cnt == 0 && nmultiple == ndistinct)
		{
			/*
1362 1363
			 * Every value in the sample appeared more than once.  Assume
			 * the column has just these values.
1364 1365 1366 1367 1368 1369 1370
			 */
			stats->stadistinct = ndistinct;
		}
		else
		{
			/*----------
			 * Estimate the number of distinct values using the estimator
1371 1372 1373 1374 1375 1376 1377 1378 1379
			 * proposed by Haas and Stokes in IBM Research Report RJ 10025:
			 *		n*d / (n - f1 + f1*n/N)
			 * where f1 is the number of distinct values that occurred
			 * exactly once in our sample of n rows (from a total of N),
			 * and d is the total number of distinct values in the sample.
			 * This is their Duj1 estimator; the other estimators they
			 * recommend are considerably more complex, and are numerically
			 * very unstable when n is much smaller than N.
			 *
1380 1381 1382
			 * Overwidth values are assumed to have been distinct.
			 *----------
			 */
1383
			int			f1 = ndistinct - nmultiple + toowide_cnt;
1384
			int			d = f1 + nmultiple;
B
Bruce Momjian 已提交
1385 1386 1387 1388 1389
			double		numer,
						denom,
						stadistinct;

			numer = (double) numrows *(double) d;
1390 1391

			denom = (double) (numrows - f1) +
B
Bruce Momjian 已提交
1392 1393
				(double) f1 *(double) numrows / totalrows;

1394 1395 1396 1397 1398 1399 1400
			stadistinct = numer / denom;
			/* Clamp to sane range in case of roundoff error */
			if (stadistinct < (double) d)
				stadistinct = (double) d;
			if (stadistinct > totalrows)
				stadistinct = totalrows;
			stats->stadistinct = floor(stadistinct + 0.5);
1401 1402 1403 1404 1405 1406 1407 1408 1409
		}

		/*
		 * If we estimated the number of distinct values at more than 10%
		 * of the total row count (a very arbitrary limit), then assume
		 * that stadistinct should scale with the row count rather than be
		 * a fixed value.
		 */
		if (stats->stadistinct > 0.1 * totalrows)
1410
			stats->stadistinct = -(stats->stadistinct / totalrows);
1411

1412 1413 1414 1415
		/*
		 * Decide how many values are worth storing as most-common values.
		 * If we are able to generate a complete MCV list (all the values
		 * in the sample will fit, and we think these are all the ones in
1416 1417 1418 1419 1420 1421 1422 1423
		 * the table), then do so.	Otherwise, store only those values
		 * that are significantly more common than the (estimated)
		 * average. We set the threshold rather arbitrarily at 25% more
		 * than average, with at least 2 instances in the sample.  Also,
		 * we won't suppress values that have a frequency of at least 1/K
		 * where K is the intended number of histogram bins; such values
		 * might otherwise cause us to emit duplicate histogram bin
		 * boundaries.
1424 1425 1426 1427 1428 1429 1430 1431 1432 1433
		 */
		if (track_cnt == ndistinct && toowide_cnt == 0 &&
			stats->stadistinct > 0 &&
			track_cnt <= num_mcv)
		{
			/* Track list includes all values seen, and all will fit */
			num_mcv = track_cnt;
		}
		else
		{
1434 1435 1436 1437
			double		ndistinct = stats->stadistinct;
			double		avgcount,
						mincount,
						maxmincount;
1438 1439

			if (ndistinct < 0)
1440
				ndistinct = -ndistinct * totalrows;
1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463
			/* estimate # of occurrences in sample of a typical value */
			avgcount = (double) numrows / ndistinct;
			/* set minimum threshold count to store a value */
			mincount = avgcount * 1.25;
			if (mincount < 2)
				mincount = 2;
			/* don't let threshold exceed 1/K, however */
			maxmincount = (double) numrows / (double) num_bins;
			if (mincount > maxmincount)
				mincount = maxmincount;
			if (num_mcv > track_cnt)
				num_mcv = track_cnt;
			for (i = 0; i < num_mcv; i++)
			{
				if (track[i].count < mincount)
				{
					num_mcv = i;
					break;
				}
			}
		}

		/* Generate MCV slot entry */
1464 1465 1466
		if (num_mcv > 0)
		{
			MemoryContext old_context;
1467 1468
			Datum	   *mcv_values;
			float4	   *mcv_freqs;
1469

1470 1471
			/* Must copy the target values into anl_context */
			old_context = MemoryContextSwitchTo(anl_context);
1472 1473 1474 1475 1476 1477 1478 1479
			mcv_values = (Datum *) palloc(num_mcv * sizeof(Datum));
			mcv_freqs = (float4 *) palloc(num_mcv * sizeof(float4));
			for (i = 0; i < num_mcv; i++)
			{
				mcv_values[i] = datumCopy(values[track[i].first].value,
										  stats->attr->attbyval,
										  stats->attr->attlen);
				mcv_freqs[i] = (double) track[i].count / (double) numrows;
B
Bruce Momjian 已提交
1480
			}
1481 1482 1483 1484 1485 1486 1487 1488 1489 1490
			MemoryContextSwitchTo(old_context);

			stats->stakind[slot_idx] = STATISTIC_KIND_MCV;
			stats->staop[slot_idx] = stats->eqopr;
			stats->stanumbers[slot_idx] = mcv_freqs;
			stats->numnumbers[slot_idx] = num_mcv;
			stats->stavalues[slot_idx] = mcv_values;
			stats->numvalues[slot_idx] = num_mcv;
			slot_idx++;
		}
B
Bruce Momjian 已提交
1491

1492 1493 1494 1495 1496 1497
		/*
		 * Generate a histogram slot entry if there are at least two
		 * distinct values not accounted for in the MCV list.  (This
		 * ensures the histogram won't collapse to empty or a singleton.)
		 */
		num_hist = ndistinct - num_mcv;
1498 1499
		if (num_hist > num_bins)
			num_hist = num_bins + 1;
1500 1501 1502
		if (num_hist >= 2)
		{
			MemoryContext old_context;
1503 1504
			Datum	   *hist_values;
			int			nvals;
B
Bruce Momjian 已提交
1505

1506 1507 1508
			/* Sort the MCV items into position order to speed next loop */
			qsort((void *) track, num_mcv,
				  sizeof(ScalarMCVItem), compare_mcvs);
B
Bruce Momjian 已提交
1509 1510

			/*
1511
			 * Collapse out the MCV items from the values[] array.
B
Bruce Momjian 已提交
1512
			 *
1513
			 * Note we destroy the values[] array here... but we don't need
1514 1515 1516
			 * it for anything more.  We do, however, still need
			 * values_cnt. nvals will be the number of remaining entries
			 * in values[].
B
Bruce Momjian 已提交
1517
			 */
1518
			if (num_mcv > 0)
B
Bruce Momjian 已提交
1519
			{
1520 1521 1522
				int			src,
							dest;
				int			j;
B
Bruce Momjian 已提交
1523

1524 1525 1526 1527
				src = dest = 0;
				j = 0;			/* index of next interesting MCV item */
				while (src < values_cnt)
				{
1528
					int			ncopy;
1529 1530 1531

					if (j < num_mcv)
					{
1532
						int			first = track[j].first;
1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554

						if (src >= first)
						{
							/* advance past this MCV item */
							src = first + track[j].count;
							j++;
							continue;
						}
						ncopy = first - src;
					}
					else
						ncopy = values_cnt - src;
					memmove(&values[dest], &values[src],
							ncopy * sizeof(ScalarItem));
					src += ncopy;
					dest += ncopy;
				}
				nvals = dest;
			}
			else
				nvals = values_cnt;
			Assert(nvals >= num_hist);
B
Bruce Momjian 已提交
1555

1556 1557
			/* Must copy the target values into anl_context */
			old_context = MemoryContextSwitchTo(anl_context);
1558 1559 1560
			hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
			for (i = 0; i < num_hist; i++)
			{
1561
				int			pos;
B
Bruce Momjian 已提交
1562

1563 1564 1565 1566
				pos = (i * (nvals - 1)) / (num_hist - 1);
				hist_values[i] = datumCopy(values[pos].value,
										   stats->attr->attbyval,
										   stats->attr->attlen);
B
Bruce Momjian 已提交
1567
			}
1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580
			MemoryContextSwitchTo(old_context);

			stats->stakind[slot_idx] = STATISTIC_KIND_HISTOGRAM;
			stats->staop[slot_idx] = stats->ltopr;
			stats->stavalues[slot_idx] = hist_values;
			stats->numvalues[slot_idx] = num_hist;
			slot_idx++;
		}

		/* Generate a correlation entry if there are multiple values */
		if (values_cnt > 1)
		{
			MemoryContext old_context;
1581 1582 1583
			float4	   *corrs;
			double		corr_xsum,
						corr_x2sum;
1584

1585 1586
			/* Must copy the target values into anl_context */
			old_context = MemoryContextSwitchTo(anl_context);
1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598
			corrs = (float4 *) palloc(sizeof(float4));
			MemoryContextSwitchTo(old_context);

			/*----------
			 * Since we know the x and y value sets are both
			 *		0, 1, ..., values_cnt-1
			 * we have sum(x) = sum(y) =
			 *		(values_cnt-1)*values_cnt / 2
			 * and sum(x^2) = sum(y^2) =
			 *		(values_cnt-1)*values_cnt*(2*values_cnt-1) / 6.
			 *----------
			 */
1599 1600 1601 1602
			corr_xsum = ((double) (values_cnt - 1)) *
				((double) values_cnt) / 2.0;
			corr_x2sum = ((double) (values_cnt - 1)) *
				((double) values_cnt) * (double) (2 * values_cnt - 1) / 6.0;
1603

1604 1605 1606 1607 1608 1609 1610 1611 1612
			/* And the correlation coefficient reduces to */
			corrs[0] = (values_cnt * corr_xysum - corr_xsum * corr_xsum) /
				(values_cnt * corr_x2sum - corr_xsum * corr_xsum);

			stats->stakind[slot_idx] = STATISTIC_KIND_CORRELATION;
			stats->staop[slot_idx] = stats->ltopr;
			stats->stanumbers[slot_idx] = corrs;
			stats->numnumbers[slot_idx] = 1;
			slot_idx++;
B
Bruce Momjian 已提交
1613 1614
		}
	}
1615 1616

	/* We don't need to bother cleaning up any of our temporary palloc's */
B
Bruce Momjian 已提交
1617 1618 1619
}

/*
1620
 * qsort comparator for sorting ScalarItems
B
Bruce Momjian 已提交
1621
 *
1622
 * Aside from sorting the items, we update the datumCmpTupnoLink[] array
1623
 * whenever two ScalarItems are found to contain equal datums.	The array
1624 1625 1626
 * is indexed by tupno; for each ScalarItem, it contains the highest
 * tupno that that item's datum has been found to be equal to.  This allows
 * us to avoid additional comparisons in compute_scalar_stats().
B
Bruce Momjian 已提交
1627
 */
1628 1629
static int
compare_scalars(const void *a, const void *b)
B
Bruce Momjian 已提交
1630
{
1631 1632 1633 1634
	Datum		da = ((ScalarItem *) a)->value;
	int			ta = ((ScalarItem *) a)->tupno;
	Datum		db = ((ScalarItem *) b)->value;
	int			tb = ((ScalarItem *) b)->tupno;
1635
	int32		compare;
B
Bruce Momjian 已提交
1636

1637 1638 1639 1640
	compare = ApplySortFunction(datumCmpFn, datumCmpFnKind,
								da, false, db, false);
	if (compare != 0)
		return compare;
B
Bruce Momjian 已提交
1641

1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674
	/*
	 * The two datums are equal, so update datumCmpTupnoLink[].
	 */
	if (datumCmpTupnoLink[ta] < tb)
		datumCmpTupnoLink[ta] = tb;
	if (datumCmpTupnoLink[tb] < ta)
		datumCmpTupnoLink[tb] = ta;

	/*
	 * For equal datums, sort by tupno
	 */
	return ta - tb;
}

/*
 * qsort comparator for sorting ScalarMCVItems by position
 */
static int
compare_mcvs(const void *a, const void *b)
{
	int			da = ((ScalarMCVItem *) a)->first;
	int			db = ((ScalarMCVItem *) b)->first;

	return da - db;
}


/*
 *	update_attstats() -- update attribute statistics for one relation
 *
 *		Statistics are stored in several places: the pg_class row for the
 *		relation has stats about the whole relation, and there is a
 *		pg_statistic row for each (non-system) attribute that has ever
1675
 *		been analyzed.	The pg_class values are updated by VACUUM, not here.
1676 1677 1678 1679 1680 1681 1682 1683 1684 1685
 *
 *		pg_statistic rows are just added or updated normally.  This means
 *		that pg_statistic will probably contain some deleted rows at the
 *		completion of a vacuum cycle, unless it happens to get vacuumed last.
 *
 *		To keep things simple, we punt for pg_statistic, and don't try
 *		to compute or store rows for pg_statistic itself in pg_statistic.
 *		This could possibly be made to work, but it's not worth the trouble.
 *		Note analyze_rel() has seen to it that we won't come here when
 *		vacuuming pg_statistic itself.
1686 1687 1688 1689 1690 1691 1692 1693
 *
 *		Note: if two backends concurrently try to analyze the same relation,
 *		the second one is likely to fail here with a "tuple concurrently
 *		updated" error.  This is slightly annoying, but no real harm is done.
 *		We could prevent the problem by using a stronger lock on the
 *		relation for ANALYZE (ie, ShareUpdateExclusiveLock instead
 *		of AccessShareLock); but that cure seems worse than the disease,
 *		especially now that ANALYZE doesn't start a new transaction
B
Bruce Momjian 已提交
1694
 *		for each relation.	The lock could be held for a long time...
1695 1696 1697 1698 1699 1700 1701
 */
static void
update_attstats(Oid relid, int natts, VacAttrStats **vacattrstats)
{
	Relation	sd;
	int			attno;

1702
	sd = heap_openr(StatisticRelationName, RowExclusiveLock);
1703 1704

	for (attno = 0; attno < natts; attno++)
B
Bruce Momjian 已提交
1705
	{
1706 1707 1708
		VacAttrStats *stats = vacattrstats[attno];
		HeapTuple	stup,
					oldtup;
1709 1710 1711
		int			i,
					k,
					n;
1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723
		Datum		values[Natts_pg_statistic];
		char		nulls[Natts_pg_statistic];
		char		replaces[Natts_pg_statistic];

		/* Ignore attr if we weren't able to collect stats */
		if (!stats->stats_valid)
			continue;

		/*
		 * Construct a new pg_statistic tuple
		 */
		for (i = 0; i < Natts_pg_statistic; ++i)
B
Bruce Momjian 已提交
1724
		{
1725 1726 1727
			nulls[i] = ' ';
			replaces[i] = 'r';
		}
B
Bruce Momjian 已提交
1728

1729
		i = 0;
1730 1731 1732 1733 1734
		values[i++] = ObjectIdGetDatum(relid);	/* starelid */
		values[i++] = Int16GetDatum(stats->attnum);		/* staattnum */
		values[i++] = Float4GetDatum(stats->stanullfrac);		/* stanullfrac */
		values[i++] = Int32GetDatum(stats->stawidth);	/* stawidth */
		values[i++] = Float4GetDatum(stats->stadistinct);		/* stadistinct */
1735 1736
		for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
		{
1737
			values[i++] = Int16GetDatum(stats->stakind[k]);		/* stakindN */
1738 1739 1740
		}
		for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
		{
1741
			values[i++] = ObjectIdGetDatum(stats->staop[k]);	/* staopN */
1742 1743 1744
		}
		for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
		{
1745
			int			nnum = stats->numnumbers[k];
1746 1747

			if (nnum > 0)
B
Bruce Momjian 已提交
1748
			{
1749 1750 1751 1752 1753 1754 1755
				Datum	   *numdatums = (Datum *) palloc(nnum * sizeof(Datum));
				ArrayType  *arry;

				for (n = 0; n < nnum; n++)
					numdatums[n] = Float4GetDatum(stats->stanumbers[k][n]);
				/* XXX knows more than it should about type float4: */
				arry = construct_array(numdatums, nnum,
1756 1757
									   FLOAT4OID,
									   sizeof(float4), false, 'i');
1758
				values[i++] = PointerGetDatum(arry);	/* stanumbersN */
1759 1760 1761 1762 1763
			}
			else
			{
				nulls[i] = 'n';
				values[i++] = (Datum) 0;
B
Bruce Momjian 已提交
1764 1765
			}
		}
1766 1767
		for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
		{
1768
			if (stats->numvalues[k] > 0)
1769 1770
			{
				ArrayType  *arry;
B
Bruce Momjian 已提交
1771

1772 1773 1774 1775 1776 1777
				arry = construct_array(stats->stavalues[k],
									   stats->numvalues[k],
									   stats->attr->atttypid,
									   stats->attrtype->typlen,
									   stats->attrtype->typbyval,
									   stats->attrtype->typalign);
1778
				values[i++] = PointerGetDatum(arry);	/* stavaluesN */
1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807
			}
			else
			{
				nulls[i] = 'n';
				values[i++] = (Datum) 0;
			}
		}

		/* Is there already a pg_statistic tuple for this attribute? */
		oldtup = SearchSysCache(STATRELATT,
								ObjectIdGetDatum(relid),
								Int16GetDatum(stats->attnum),
								0, 0);

		if (HeapTupleIsValid(oldtup))
		{
			/* Yes, replace it */
			stup = heap_modifytuple(oldtup,
									sd,
									values,
									nulls,
									replaces);
			ReleaseSysCache(oldtup);
			simple_heap_update(sd, &stup->t_self, stup);
		}
		else
		{
			/* No, insert new tuple */
			stup = heap_formtuple(sd->rd_att, values, nulls);
1808
			simple_heap_insert(sd, stup);
1809 1810
		}

1811 1812
		/* update indexes too */
		CatalogUpdateIndexes(sd, stup);
1813 1814 1815 1816

		heap_freetuple(stup);
	}

1817
	heap_close(sd, RowExclusiveLock);
B
Bruce Momjian 已提交
1818
}