diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 07c4da115f59c6eec63cee3874194fdb3401c337..1c9b3c60b9d70b7fa28fe51e309fb1e90b2be0f6 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -15,7 +15,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.90 2001/05/20 20:28:19 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.91 2001/05/27 17:37:48 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -940,9 +940,7 @@ Datum eqjoinsel(PG_FUNCTION_ARGS) { Query *root = (Query *) PG_GETARG_POINTER(0); -#ifdef NOT_USED /* see neqjoinsel() before removing me! */ Oid operator = PG_GETARG_OID(1); -#endif List *args = (List *) PG_GETARG_POINTER(2); Var *var1; Var *var2; @@ -958,73 +956,219 @@ eqjoinsel(PG_FUNCTION_ARGS) HeapTuple statsTuple2 = NULL; Form_pg_statistic stats1 = NULL; Form_pg_statistic stats2 = NULL; - double nd1, - nd2; - - if (var1 == NULL) - { - nd1 = DEFAULT_NUM_DISTINCT; - } - else + double nd1 = DEFAULT_NUM_DISTINCT; + double nd2 = DEFAULT_NUM_DISTINCT; + bool have_mcvs1 = false; + Datum *values1 = NULL; + int nvalues1 = 0; + float4 *numbers1 = NULL; + int nnumbers1 = 0; + bool have_mcvs2 = false; + Datum *values2 = NULL; + int nvalues2 = 0; + float4 *numbers2 = NULL; + int nnumbers2 = 0; + + if (var1 != NULL) { /* get stats for the attribute, if available */ Oid relid1 = getrelid(var1->varno, root->rtable); - if (relid1 == InvalidOid) - nd1 = DEFAULT_NUM_DISTINCT; - else + if (relid1 != InvalidOid) { statsTuple1 = SearchSysCache(STATRELATT, ObjectIdGetDatum(relid1), Int16GetDatum(var1->varattno), 0, 0); if (HeapTupleIsValid(statsTuple1)) + { stats1 = (Form_pg_statistic) GETSTRUCT(statsTuple1); + have_mcvs1 = get_attstatsslot(statsTuple1, + var1->vartype, + var1->vartypmod, + STATISTIC_KIND_MCV, + InvalidOid, + &values1, &nvalues1, + &numbers1, &nnumbers1); + } nd1 = get_att_numdistinct(root, var1, stats1); } } - if (var2 == NULL) - { - nd2 = DEFAULT_NUM_DISTINCT; - } - else + if (var2 != NULL) { /* get stats for the attribute, if available */ Oid relid2 = getrelid(var2->varno, root->rtable); - if (relid2 == InvalidOid) - nd2 = DEFAULT_NUM_DISTINCT; - else + if (relid2 != InvalidOid) { statsTuple2 = SearchSysCache(STATRELATT, ObjectIdGetDatum(relid2), Int16GetDatum(var2->varattno), 0, 0); if (HeapTupleIsValid(statsTuple2)) + { stats2 = (Form_pg_statistic) GETSTRUCT(statsTuple2); + have_mcvs2 = get_attstatsslot(statsTuple2, + var2->vartype, + var2->vartypmod, + STATISTIC_KIND_MCV, + InvalidOid, + &values2, &nvalues2, + &numbers2, &nnumbers2); + } nd2 = get_att_numdistinct(root, var2, stats2); } } - /* - * Estimate the join selectivity as 1 / sqrt(nd1*nd2) - * (can we produce any theory for this)? - * - * XXX possibility to do better: if both attributes have histograms - * then we could determine the exact join selectivity between the - * MCV sets, and only have to assume the join behavior of the non-MCV - * values. This could be a big win when the MCVs cover a large part - * of the population. - * - * XXX what about nulls? - */ - selec = 1.0 / sqrt(nd1 * nd2); - if (selec > 1.0) - selec = 1.0; + if (have_mcvs1 && have_mcvs2) + { + /* + * We have most-common-value lists for both relations. Run + * through the lists to see which MCVs actually join to each + * other with the given operator. This allows us to determine + * the exact join selectivity for the portion of the relations + * represented by the MCV lists. We still have to estimate for + * the remaining population, but in a skewed distribution this + * gives us a big leg up in accuracy. For motivation see the + * analysis in Y. Ioannidis and S. Christodoulakis, "On the + * propagation of errors in the size of join results", Technical + * Report 1018, Computer Science Dept., University of Wisconsin, + * Madison, March 1991 (available from ftp.cs.wisc.edu). + */ + FmgrInfo eqproc; + bool *hasmatch1; + bool *hasmatch2; + double matchprodfreq, + matchfreq1, + matchfreq2, + unmatchfreq1, + unmatchfreq2, + otherfreq1, + otherfreq2, + totalsel1, + totalsel2; + int i, + nmatches; + + fmgr_info(get_opcode(operator), &eqproc); + hasmatch1 = (bool *) palloc(nvalues1 * sizeof(bool)); + memset(hasmatch1, 0, nvalues1 * sizeof(bool)); + hasmatch2 = (bool *) palloc(nvalues2 * sizeof(bool)); + memset(hasmatch2, 0, nvalues2 * sizeof(bool)); + /* + * Note we assume that each MCV will match at most one member of + * the other MCV list. If the operator isn't really equality, + * there could be multiple matches --- but we don't look for them, + * both for speed and because the math wouldn't add up... + */ + matchprodfreq = 0.0; + nmatches = 0; + for (i = 0; i < nvalues1; i++) + { + int j; + for (j = 0; j < nvalues2; j++) + { + if (hasmatch2[j]) + continue; + if (DatumGetBool(FunctionCall2(&eqproc, + values1[i], + values2[j]))) + { + hasmatch1[i] = hasmatch2[j] = true; + matchprodfreq += numbers1[i] * numbers2[j]; + nmatches++; + break; + } + } + } + /* Sum up frequencies of matched and unmatched MCVs */ + matchfreq1 = unmatchfreq1 = 0.0; + for (i = 0; i < nvalues1; i++) + { + if (hasmatch1[i]) + matchfreq1 += numbers1[i]; + else + unmatchfreq1 += numbers1[i]; + } + matchfreq2 = unmatchfreq2 = 0.0; + for (i = 0; i < nvalues2; i++) + { + if (hasmatch2[i]) + matchfreq2 += numbers2[i]; + else + unmatchfreq2 += numbers2[i]; + } + pfree(hasmatch1); + pfree(hasmatch2); + /* + * Compute total frequency of non-null values that are not in + * the MCV lists. + */ + otherfreq1 = 1.0 - stats1->stanullfrac - matchfreq1 - unmatchfreq1; + otherfreq2 = 1.0 - stats2->stanullfrac - matchfreq2 - unmatchfreq2; + /* + * We can estimate the total selectivity from the point of view + * of relation 1 as: the known selectivity for matched MCVs, plus + * unmatched MCVs that are assumed to match against random members + * of relation 2's non-MCV population, plus non-MCV values that + * are assumed to match against random members of relation 2's + * unmatched MCVs plus non-MCV values. + */ + totalsel1 = matchprodfreq; + if (nd2 > nvalues2) + totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2); + if (nd2 > nmatches) + totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) / + (nd2 - nmatches); + /* Same estimate from the point of view of relation 2. */ + totalsel2 = matchprodfreq; + if (nd1 > nvalues1) + totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1); + if (nd1 > nmatches) + totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) / + (nd1 - nmatches); + /* + * For robustness, we average the two estimates. (Can a case + * be made for taking the min or max instead?) + */ + selec = (totalsel1 + totalsel2) * 0.5; + } + else + { + /* + * We do not have MCV lists for both sides. Estimate the + * join selectivity as MIN(1/nd1, 1/nd2). This is plausible + * if we assume that the values are about equally distributed: + * a given tuple of rel1 will join to either 0 or N2/nd2 rows + * of rel2, so total join rows are at most N1*N2/nd2 giving + * a join selectivity of not more than 1/nd2. By the same logic + * it is not more than 1/nd1, so MIN(1/nd1, 1/nd2) is an upper + * bound. Using the MIN() means we estimate from the point of + * view of the relation with smaller nd (since the larger nd is + * determining the MIN). It is reasonable to assume that most + * tuples in this rel will have join partners, so the bound is + * probably reasonably tight and should be taken as-is. + * + * XXX Can we be smarter if we have an MCV list for just one side? + * It seems that if we assume equal distribution for the other + * side, we end up with the same answer anyway. + */ + if (nd1 > nd2) + selec = 1.0 / nd1; + else + selec = 1.0 / nd2; + } + + if (have_mcvs1) + free_attstatsslot(var1->vartype, values1, nvalues1, + numbers1, nnumbers1); + if (have_mcvs2) + free_attstatsslot(var2->vartype, values2, nvalues2, + numbers2, nnumbers2); if (HeapTupleIsValid(statsTuple1)) ReleaseSysCache(statsTuple1); if (HeapTupleIsValid(statsTuple2)) @@ -1039,14 +1183,30 @@ eqjoinsel(PG_FUNCTION_ARGS) Datum neqjoinsel(PG_FUNCTION_ARGS) { + Query *root = (Query *) PG_GETARG_POINTER(0); + Oid operator = PG_GETARG_OID(1); + List *args = (List *) PG_GETARG_POINTER(2); + Oid eqop; float8 result; /* - * XXX we skip looking up the negator operator here because we know - * eqjoinsel() won't look at it anyway. If eqjoinsel() ever does - * look, this routine will need to look more like neqsel() does. + * We want 1 - eqjoinsel() where the equality operator is the one + * associated with this != operator, that is, its negator. */ - result = DatumGetFloat8(eqjoinsel(fcinfo)); + eqop = get_negator(operator); + if (eqop) + { + result = DatumGetFloat8(DirectFunctionCall3(eqjoinsel, + PointerGetDatum(root), + ObjectIdGetDatum(eqop), + PointerGetDatum(args))); + + } + else + { + /* Use default selectivity (should we raise an error instead?) */ + result = DEFAULT_EQ_SEL; + } result = 1.0 - result; PG_RETURN_FLOAT8(result); }