From 1db5af279441b9ee215b54de424c2af92eeb1ef8 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 20 Dec 2011 19:57:34 -0500 Subject: [PATCH] Fix gincostestimate to handle ScalarArrayOpExpr reasonably. The original coding of this function overlooked the possibility that it could be passed anything except simple OpExpr indexquals. But ScalarArrayOpExpr is possible too, and the code would probably crash (and surely give ridiculous answers) in such a case. Add logic to try to estimate sanely for such cases. In passing, fix the treatment of inner-indexscan cost estimation: it was failing to scale up properly for multiple iterations of a nestloop. (I think somebody might've thought that index_pages_fetched() is linear, but of course it's not.) Report, diagnosis, and preliminary patch by Marti Raudsepp; I refactored it a bit and fixed the cost estimation. Back-patch into 9.1 where the bogus code was introduced. --- src/backend/utils/adt/selfuncs.c | 496 ++++++++++++++++++-------- src/test/regress/expected/tsearch.out | 12 + src/test/regress/sql/tsearch.sql | 2 + 3 files changed, 366 insertions(+), 144 deletions(-) diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index d06809e767..abef8abc38 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6575,6 +6575,20 @@ spgcostestimate(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +/* + * Support routines for gincostestimate + */ + +typedef struct +{ + bool haveFullScan; + double partialEntries; + double exactEntries; + double searchEntries; + double arrayScans; +} GinQualCounts; + /* Find the index column matching "op"; return its index, or -1 if no match */ static int find_index_column(Node *op, IndexOptInfo *index) @@ -6590,6 +6604,277 @@ find_index_column(Node *op, IndexOptInfo *index) return -1; } +/* + * Estimate the number of index terms that need to be searched for while + * testing the given GIN query, and increment the counts in *counts + * appropriately. If the query is unsatisfiable, return false. + */ +static bool +gincost_pattern(IndexOptInfo *index, int indexcol, + Oid clause_op, Datum query, + GinQualCounts *counts) +{ + Oid extractProcOid; + int strategy_op; + Oid lefttype, + righttype; + int32 nentries = 0; + bool *partial_matches = NULL; + Pointer *extra_data = NULL; + bool *nullFlags = NULL; + int32 searchMode = GIN_SEARCH_MODE_DEFAULT; + int32 i; + + /* + * Get the operator's strategy number and declared input data types + * within the index opfamily. (We don't need the latter, but we use + * get_op_opfamily_properties because it will throw error if it fails + * to find a matching pg_amop entry.) + */ + get_op_opfamily_properties(clause_op, index->opfamily[indexcol], false, + &strategy_op, &lefttype, &righttype); + + /* + * GIN always uses the "default" support functions, which are those + * with lefttype == righttype == the opclass' opcintype (see + * IndexSupportInitialize in relcache.c). + */ + extractProcOid = get_opfamily_proc(index->opfamily[indexcol], + index->opcintype[indexcol], + index->opcintype[indexcol], + GIN_EXTRACTQUERY_PROC); + + if (!OidIsValid(extractProcOid)) + { + /* should not happen; throw same error as index_getprocinfo */ + elog(ERROR, "missing support function %d for attribute %d of index \"%s\"", + GIN_EXTRACTQUERY_PROC, indexcol + 1, + get_rel_name(index->indexoid)); + } + + OidFunctionCall7(extractProcOid, + query, + PointerGetDatum(&nentries), + UInt16GetDatum(strategy_op), + PointerGetDatum(&partial_matches), + PointerGetDatum(&extra_data), + PointerGetDatum(&nullFlags), + PointerGetDatum(&searchMode)); + + if (nentries <= 0 && searchMode == GIN_SEARCH_MODE_DEFAULT) + { + /* No match is possible */ + return false; + } + + for (i = 0; i < nentries; i++) + { + /* + * For partial match we haven't any information to estimate number of + * matched entries in index, so, we just estimate it as 100 + */ + if (partial_matches && partial_matches[i]) + counts->partialEntries += 100; + else + counts->exactEntries++; + + counts->searchEntries++; + } + + if (searchMode == GIN_SEARCH_MODE_INCLUDE_EMPTY) + { + /* Treat "include empty" like an exact-match item */ + counts->exactEntries++; + counts->searchEntries++; + } + else if (searchMode != GIN_SEARCH_MODE_DEFAULT) + { + /* It's GIN_SEARCH_MODE_ALL */ + counts->haveFullScan = true; + } + + return true; +} + +/* + * Estimate the number of index terms that need to be searched for while + * testing the given GIN index clause, and increment the counts in *counts + * appropriately. If the query is unsatisfiable, return false. + */ +static bool +gincost_opexpr(IndexOptInfo *index, OpExpr *clause, GinQualCounts *counts) +{ + Node *leftop = get_leftop((Expr *) clause); + Node *rightop = get_rightop((Expr *) clause); + Oid clause_op = clause->opno; + int indexcol; + Node *operand; + + /* Locate the operand being compared to the index column */ + if ((indexcol = find_index_column(leftop, index)) >= 0) + { + operand = rightop; + } + else if ((indexcol = find_index_column(rightop, index)) >= 0) + { + operand = leftop; + clause_op = get_commutator(clause_op); + } + else + { + elog(ERROR, "could not match index to operand"); + operand = NULL; /* keep compiler quiet */ + } + + if (IsA(operand, RelabelType)) + operand = (Node *) ((RelabelType *) operand)->arg; + + /* + * It's impossible to call extractQuery method for unknown operand. So + * unless operand is a Const we can't do much; just assume there will + * be one ordinary search entry from the operand at runtime. + */ + if (!IsA(operand, Const)) + { + counts->exactEntries++; + counts->searchEntries++; + return true; + } + + /* If Const is null, there can be no matches */ + if (((Const *) operand)->constisnull) + return false; + + /* Otherwise, apply extractQuery and get the actual term counts */ + return gincost_pattern(index, indexcol, clause_op, + ((Const *) operand)->constvalue, + counts); +} + +/* + * Estimate the number of index terms that need to be searched for while + * testing the given GIN index clause, and increment the counts in *counts + * appropriately. If the query is unsatisfiable, return false. + * + * A ScalarArrayOpExpr will give rise to N separate indexscans at runtime, + * each of which involves one value from the RHS array, plus all the + * non-array quals (if any). To model this, we average the counts across + * the RHS elements, and add the averages to the counts in *counts (which + * correspond to per-indexscan costs). We also multiply counts->arrayScans + * by N, causing gincostestimate to scale up its estimates accordingly. + */ +static bool +gincost_scalararrayopexpr(IndexOptInfo *index, ScalarArrayOpExpr *clause, + double numIndexEntries, + GinQualCounts *counts) +{ + Node *leftop = (Node *) linitial(clause->args); + Node *rightop = (Node *) lsecond(clause->args); + Oid clause_op = clause->opno; + int indexcol; + ArrayType *arrayval; + int16 elmlen; + bool elmbyval; + char elmalign; + int numElems; + Datum *elemValues; + bool *elemNulls; + GinQualCounts arraycounts; + int numPossible = 0; + int i; + + Assert(clause->useOr); + + /* index column must be on the left */ + if ((indexcol = find_index_column(leftop, index)) < 0) + elog(ERROR, "could not match index to operand"); + + if (IsA(rightop, RelabelType)) + rightop = (Node *) ((RelabelType *) rightop)->arg; + + /* + * It's impossible to call extractQuery method for unknown operand. So + * unless operand is a Const we can't do much; just assume there will + * be one ordinary search entry from each array entry at runtime, and + * fall back on a probably-bad estimate of the number of array entries. + */ + if (!IsA(rightop, Const)) + { + counts->exactEntries++; + counts->searchEntries++; + counts->arrayScans *= estimate_array_length(rightop); + return true; + } + + /* If Const is null, there can be no matches */ + if (((Const *) rightop)->constisnull) + return false; + + /* Otherwise, extract the array elements and iterate over them */ + arrayval = DatumGetArrayTypeP(((Const *) rightop)->constvalue); + get_typlenbyvalalign(ARR_ELEMTYPE(arrayval), + &elmlen, &elmbyval, &elmalign); + deconstruct_array(arrayval, + ARR_ELEMTYPE(arrayval), + elmlen, elmbyval, elmalign, + &elemValues, &elemNulls, &numElems); + + memset(&arraycounts, 0, sizeof(arraycounts)); + + for (i = 0; i < numElems; i++) + { + GinQualCounts elemcounts; + + /* NULL can't match anything, so ignore, as the executor will */ + if (elemNulls[i]) + continue; + + /* Otherwise, apply extractQuery and get the actual term counts */ + memset(&elemcounts, 0, sizeof(elemcounts)); + + if (gincost_pattern(index, indexcol, clause_op, elemValues[i], + &elemcounts)) + { + /* We ignore array elements that are unsatisfiable patterns */ + numPossible++; + + if (elemcounts.haveFullScan) + { + /* + * Full index scan will be required. We treat this as if + * every key in the index had been listed in the query; is + * that reasonable? + */ + elemcounts.partialEntries = 0; + elemcounts.exactEntries = numIndexEntries; + elemcounts.searchEntries = numIndexEntries; + } + arraycounts.partialEntries += elemcounts.partialEntries; + arraycounts.exactEntries += elemcounts.exactEntries; + arraycounts.searchEntries += elemcounts.searchEntries; + } + } + + if (numPossible == 0) + { + /* No satisfiable patterns in the array */ + return false; + } + + /* + * Now add the averages to the global counts. This will give us an + * estimate of the average number of terms searched for in each indexscan, + * including contributions from both array and non-array quals. + */ + counts->partialEntries += arraycounts.partialEntries / numPossible; + counts->exactEntries += arraycounts.exactEntries / numPossible; + counts->searchEntries += arraycounts.searchEntries / numPossible; + + counts->arrayScans *= numPossible; + + return true; +} + /* * GIN has search behavior completely different from other index types */ @@ -6613,17 +6898,15 @@ gincostestimate(PG_FUNCTION_ARGS) numDataPages, numPendingPages, numEntries; - bool haveFullScan = false; - double partialEntriesInQuals = 0.0; - double searchEntriesInQuals = 0.0; - double exactEntriesInQuals = 0.0; + GinQualCounts counts; + bool matchPossible; double entryPagesFetched, dataPagesFetched, dataPagesFetchedBySel; double qual_op_cost, qual_arg_cost, spc_random_page_cost, - num_scans; + outer_scans; QualCost index_qual_cost; Relation indexRel; GinStatsData ginStats; @@ -6709,199 +6992,113 @@ gincostestimate(PG_FUNCTION_ARGS) /* * Examine quals to estimate number of search entries & partial matches */ + memset(&counts, 0, sizeof(counts)); + counts.arrayScans = 1; + matchPossible = true; + foreach(l, indexQuals) { RestrictInfo *rinfo = (RestrictInfo *) lfirst(l); Expr *clause; - Node *leftop, - *rightop, - *operand; - Oid extractProcOid; - Oid clause_op; - int strategy_op; - Oid lefttype, - righttype; - int32 nentries = 0; - bool *partial_matches = NULL; - Pointer *extra_data = NULL; - bool *nullFlags = NULL; - int32 searchMode = GIN_SEARCH_MODE_DEFAULT; - int indexcol; Assert(IsA(rinfo, RestrictInfo)); clause = rinfo->clause; - Assert(IsA(clause, OpExpr)); - leftop = get_leftop(clause); - rightop = get_rightop(clause); - clause_op = ((OpExpr *) clause)->opno; - - if ((indexcol = find_index_column(leftop, index)) >= 0) - { - operand = rightop; - } - else if ((indexcol = find_index_column(rightop, index)) >= 0) - { - operand = leftop; - clause_op = get_commutator(clause_op); - } - else - { - elog(ERROR, "could not match index to operand"); - operand = NULL; /* keep compiler quiet */ - } - - if (IsA(operand, RelabelType)) - operand = (Node *) ((RelabelType *) operand)->arg; - - /* - * It's impossible to call extractQuery method for unknown operand. So - * unless operand is a Const we can't do much; just assume there will - * be one ordinary search entry from the operand at runtime. - */ - if (!IsA(operand, Const)) - { - searchEntriesInQuals++; - continue; - } - - /* If Const is null, there can be no matches */ - if (((Const *) operand)->constisnull) - { - *indexStartupCost = 0; - *indexTotalCost = 0; - *indexSelectivity = 0; - PG_RETURN_VOID(); - } - - /* - * Get the operator's strategy number and declared input data types - * within the index opfamily. (We don't need the latter, but we use - * get_op_opfamily_properties because it will throw error if it fails - * to find a matching pg_amop entry.) - */ - get_op_opfamily_properties(clause_op, index->opfamily[indexcol], false, - &strategy_op, &lefttype, &righttype); - - /* - * GIN always uses the "default" support functions, which are those - * with lefttype == righttype == the opclass' opcintype (see - * IndexSupportInitialize in relcache.c). - */ - extractProcOid = get_opfamily_proc(index->opfamily[indexcol], - index->opcintype[indexcol], - index->opcintype[indexcol], - GIN_EXTRACTQUERY_PROC); - - if (!OidIsValid(extractProcOid)) + if (IsA(clause, OpExpr)) { - /* should not happen; throw same error as index_getprocinfo */ - elog(ERROR, "missing support function %d for attribute %d of index \"%s\"", - GIN_EXTRACTQUERY_PROC, indexcol + 1, - get_rel_name(index->indexoid)); + matchPossible = gincost_opexpr(index, + (OpExpr *) clause, + &counts); + if (!matchPossible) + break; } - - OidFunctionCall7(extractProcOid, - ((Const *) operand)->constvalue, - PointerGetDatum(&nentries), - UInt16GetDatum(strategy_op), - PointerGetDatum(&partial_matches), - PointerGetDatum(&extra_data), - PointerGetDatum(&nullFlags), - PointerGetDatum(&searchMode)); - - if (nentries <= 0 && searchMode == GIN_SEARCH_MODE_DEFAULT) + else if (IsA(clause, ScalarArrayOpExpr)) { - /* No match is possible */ - *indexStartupCost = 0; - *indexTotalCost = 0; - *indexSelectivity = 0; - PG_RETURN_VOID(); + matchPossible = gincost_scalararrayopexpr(index, + (ScalarArrayOpExpr *) clause, + numEntries, + &counts); + if (!matchPossible) + break; } else { - int32 i; - - for (i = 0; i < nentries; i++) - { - /* - * For partial match we haven't any information to estimate - * number of matched entries in index, so, we just estimate it - * as 100 - */ - if (partial_matches && partial_matches[i]) - partialEntriesInQuals += 100; - else - exactEntriesInQuals++; - - searchEntriesInQuals++; - } + /* shouldn't be anything else for a GIN index */ + elog(ERROR, "unsupported GIN indexqual type: %d", + (int) nodeTag(clause)); } + } - if (searchMode == GIN_SEARCH_MODE_INCLUDE_EMPTY) - { - /* Treat "include empty" like an exact-match item */ - exactEntriesInQuals++; - searchEntriesInQuals++; - } - else if (searchMode != GIN_SEARCH_MODE_DEFAULT) - { - /* It's GIN_SEARCH_MODE_ALL */ - haveFullScan = true; - } + /* Fall out if there were any provably-unsatisfiable quals */ + if (!matchPossible) + { + *indexStartupCost = 0; + *indexTotalCost = 0; + *indexSelectivity = 0; + PG_RETURN_VOID(); } - if (haveFullScan || indexQuals == NIL) + if (counts.haveFullScan || indexQuals == NIL) { /* * Full index scan will be required. We treat this as if every key in * the index had been listed in the query; is that reasonable? */ - searchEntriesInQuals = numEntries; + counts.partialEntries = 0; + counts.exactEntries = numEntries; + counts.searchEntries = numEntries; } /* Will we have more than one iteration of a nestloop scan? */ if (outer_rel != NULL && outer_rel->rows > 1) - num_scans = outer_rel->rows; + outer_scans = outer_rel->rows; else - num_scans = 1; + outer_scans = 1; /* - * cost to begin scan, first of all, pay attention to pending list. + * Compute cost to begin scan, first of all, pay attention to pending list. */ entryPagesFetched = numPendingPages; /* * Estimate number of entry pages read. We need to do - * searchEntriesInQuals searches. Use a power function as it should be, + * counts.searchEntries searches. Use a power function as it should be, * but tuples on leaf pages usually is much greater. Here we include all * searches in entry tree, including search of first entry in partial * match algorithm */ - entryPagesFetched += ceil(searchEntriesInQuals * rint(pow(numEntryPages, 0.15))); + entryPagesFetched += ceil(counts.searchEntries * rint(pow(numEntryPages, 0.15))); /* * Add an estimate of entry pages read by partial match algorithm. It's a * scan over leaf pages in entry tree. We haven't any useful stats here, * so estimate it as proportion. */ - entryPagesFetched += ceil(numEntryPages * partialEntriesInQuals / numEntries); + entryPagesFetched += ceil(numEntryPages * counts.partialEntries / numEntries); /* * Partial match algorithm reads all data pages before doing actual scan, - * so it's a startup cost. Again, we havn't any useful stats here, so, + * so it's a startup cost. Again, we haven't any useful stats here, so, * estimate it as proportion */ - dataPagesFetched = ceil(numDataPages * partialEntriesInQuals / numEntries); + dataPagesFetched = ceil(numDataPages * counts.partialEntries / numEntries); - /* calculate cache effects */ - if (num_scans > 1 || searchEntriesInQuals > 1) + /* + * Calculate cache effects if more than one scan due to nestloops or array + * quals. The result is pro-rated per nestloop scan, but the array qual + * factor shouldn't be pro-rated (compare genericcostestimate). + */ + if (outer_scans > 1 || counts.arrayScans > 1) { + entryPagesFetched *= outer_scans * counts.arrayScans; entryPagesFetched = index_pages_fetched(entryPagesFetched, (BlockNumber) numEntryPages, numEntryPages, root); + entryPagesFetched /= outer_scans; + dataPagesFetched *= outer_scans * counts.arrayScans; dataPagesFetched = index_pages_fetched(dataPagesFetched, (BlockNumber) numDataPages, numDataPages, root); + dataPagesFetched /= outer_scans; } /* @@ -6910,8 +7107,12 @@ gincostestimate(PG_FUNCTION_ARGS) */ *indexStartupCost = (entryPagesFetched + dataPagesFetched) * spc_random_page_cost; - /* cost to scan data pages for each exact (non-partial) matched entry */ - dataPagesFetched = ceil(numDataPages * exactEntriesInQuals / numEntries); + /* + * Now we compute the number of data pages fetched while the scan proceeds. + */ + + /* data pages scanned for each exact (non-partial) matched entry */ + dataPagesFetched = ceil(numDataPages * counts.exactEntries / numEntries); /* * Estimate number of data pages read, using selectivity estimation and @@ -6932,10 +7133,17 @@ gincostestimate(PG_FUNCTION_ARGS) dataPagesFetched = dataPagesFetchedBySel; } - if (num_scans > 1) + /* Account for cache effects, the same as above */ + if (outer_scans > 1 || counts.arrayScans > 1) + { + dataPagesFetched *= outer_scans * counts.arrayScans; dataPagesFetched = index_pages_fetched(dataPagesFetched, (BlockNumber) numDataPages, numDataPages, root); + dataPagesFetched /= outer_scans; + } + + /* And apply random_page_cost as the cost per page */ *indexTotalCost = *indexStartupCost + dataPagesFetched * spc_random_page_cost; diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out index e1d7646c0d..9341dbe0d7 100644 --- a/src/test/regress/expected/tsearch.out +++ b/src/test/regress/expected/tsearch.out @@ -142,6 +142,12 @@ SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*'; 494 (1 row) +SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}'); + count +------- + 158 +(1 row) + RESET enable_seqscan; DROP INDEX wowidx; CREATE INDEX wowidx ON test_tsvector USING gin (a); @@ -188,6 +194,12 @@ SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*'; 494 (1 row) +SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}'); + count +------- + 158 +(1 row) + RESET enable_seqscan; INSERT INTO test_tsvector VALUES ('???', 'DFG:1A,2B,6C,10 FGH'); SELECT * FROM ts_stat('SELECT a FROM test_tsvector') ORDER BY ndoc DESC, nentry DESC, word LIMIT 10; diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql index d261da2104..9fd12076ac 100644 --- a/src/test/regress/sql/tsearch.sql +++ b/src/test/regress/sql/tsearch.sql @@ -60,6 +60,7 @@ SELECT count(*) FROM test_tsvector WHERE a @@ 'eq|yt'; SELECT count(*) FROM test_tsvector WHERE a @@ '(eq&yt)|(wr&qh)'; SELECT count(*) FROM test_tsvector WHERE a @@ '(eq|yt)&(wr|qh)'; SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*'; +SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}'); RESET enable_seqscan; @@ -76,6 +77,7 @@ SELECT count(*) FROM test_tsvector WHERE a @@ 'eq|yt'; SELECT count(*) FROM test_tsvector WHERE a @@ '(eq&yt)|(wr&qh)'; SELECT count(*) FROM test_tsvector WHERE a @@ '(eq|yt)&(wr|qh)'; SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*'; +SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}'); RESET enable_seqscan; INSERT INTO test_tsvector VALUES ('???', 'DFG:1A,2B,6C,10 FGH'); -- GitLab