diff --git a/src/backend/gpopt/gpdbwrappers.cpp b/src/backend/gpopt/gpdbwrappers.cpp index 40525220d85caeb8f6e49e3a6fecd21a69a49487..c94dac639d268fc45aaaf26e24a953bdb777be2a 100644 --- a/src/backend/gpopt/gpdbwrappers.cpp +++ b/src/backend/gpopt/gpdbwrappers.cpp @@ -639,6 +639,28 @@ gpdb::FuncStrict return false; } +bool +gpdb::IsFuncNDVPreserving + ( + Oid funcid + ) +{ + // Given a function oid, return whether it's one of a list of NDV-preserving + // functions (estimated NDV of output is similar to that of the input) + switch (funcid) + { + // for now, these are the functions we consider for this optimization + case LOWER_OID: + case LTRIM_SPACE_OID: + case BTRIM_SPACE_OID: + case RTRIM_SPACE_OID: + case UPPER_OID: + return true; + default: + return false; + } +} + char gpdb::FuncStability ( @@ -2128,6 +2150,24 @@ gpdb::IsOpStrict return false; } +bool +gpdb::IsOpNDVPreserving + ( + Oid opno + ) +{ + switch (opno) + { + // for now, we consider only the concatenation op as NDV-preserving + // (note that we do additional checks later, e.g. 
col || 'const' is + // NDV-preserving, while col1 || col2 is not) + case OIDTextConcatenateOperator: + return true; + default: + return false; + } +} + void gpdb::GetOpInputTypes ( diff --git a/src/backend/gpopt/translate/CTranslatorRelcacheToDXL.cpp b/src/backend/gpopt/translate/CTranslatorRelcacheToDXL.cpp index fde60dc9f977c9d3ae3de7564a67a156a010ded1..6b6e0433c171c839fdb60cbcda456551bc23a342 100644 --- a/src/backend/gpopt/translate/CTranslatorRelcacheToDXL.cpp +++ b/src/backend/gpopt/translate/CTranslatorRelcacheToDXL.cpp @@ -1750,6 +1750,7 @@ CTranslatorRelcacheToDXL::RetrieveScOp } BOOL returns_null_on_null_input = gpdb::IsOpStrict(op_oid); + BOOL is_ndv_preserving = gpdb::IsOpNDVPreserving(op_oid); CMDIdGPDB *mdid_hash_opfamily = NULL; OID distr_opfamily = gpdb::GetCompatibleHashOpFamily(op_oid); @@ -1781,7 +1782,8 @@ CTranslatorRelcacheToDXL::RetrieveScOp returns_null_on_null_input, RetrieveScOpOpFamilies(mp, mdid), mdid_hash_opfamily, - mdid_legacy_hash_opfamily + mdid_legacy_hash_opfamily, + is_ndv_preserving ); return md_scalar_op; } @@ -1802,12 +1804,14 @@ CTranslatorRelcacheToDXL::LookupFuncProps IMDFunction::EFuncStbl *stability, // output: function stability IMDFunction::EFuncDataAcc *access, // output: function data access BOOL *is_strict, // output: is function strict? + BOOL *is_ndv_preserving, // output: preserves NDVs of inputs BOOL *returns_set // output: does function return set? 
) { GPOS_ASSERT(NULL != stability); GPOS_ASSERT(NULL != access); GPOS_ASSERT(NULL != is_strict); + GPOS_ASSERT(NULL != is_ndv_preserving); GPOS_ASSERT(NULL != returns_set); *stability = GetFuncStability(gpdb::FuncStability(func_oid)); @@ -1818,6 +1822,7 @@ CTranslatorRelcacheToDXL::LookupFuncProps *returns_set = gpdb::GetFuncRetset(func_oid); *is_strict = gpdb::FuncStrict(func_oid); + *is_ndv_preserving = gpdb::IsFuncNDVPreserving(func_oid); } @@ -1886,7 +1891,8 @@ CTranslatorRelcacheToDXL::RetrieveFunc IMDFunction::EFuncDataAcc access = IMDFunction::EfdaNoSQL; BOOL is_strict = true; BOOL returns_set = true; - LookupFuncProps(func_oid, &stability, &access, &is_strict, &returns_set); + BOOL is_ndv_preserving = true; + LookupFuncProps(func_oid, &stability, &access, &is_strict, &is_ndv_preserving, &returns_set); mdid->AddRef(); CMDFunctionGPDB *md_func = GPOS_NEW(mp) CMDFunctionGPDB @@ -1899,7 +1905,8 @@ CTranslatorRelcacheToDXL::RetrieveFunc returns_set, stability, access, - is_strict + is_strict, + is_ndv_preserving ); return md_func; diff --git a/src/backend/gporca/data/dxl/minidump/EquiJoinOnExpr-Supported.mdp b/src/backend/gporca/data/dxl/minidump/EquiJoinOnExpr-Supported.mdp new file mode 100644 index 0000000000000000000000000000000000000000..92b15155a0b740d55262f80fe7457555240fd729 --- /dev/null +++ b/src/backend/gporca/data/dxl/minidump/EquiJoinOnExpr-Supported.mdp @@ -0,0 +1,2071 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/backend/gporca/data/dxl/minidump/EquiJoinOnExpr-Unsupported.mdp b/src/backend/gporca/data/dxl/minidump/EquiJoinOnExpr-Unsupported.mdp new file mode 100644 index 0000000000000000000000000000000000000000..f00628df1d329dd37e3b1a26ef68b5177dd7c52d --- /dev/null +++ b/src/backend/gporca/data/dxl/minidump/EquiJoinOnExpr-Unsupported.mdp @@ -0,0 +1,1634 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/backend/gporca/data/dxl/minidump/InnerJoin-With-OuterRefs.mdp b/src/backend/gporca/data/dxl/minidump/InnerJoin-With-OuterRefs.mdp index c39067eb0ae2c4ce46aa7d35b7847728843ca6ce..a6b943ab6f1350d86498c477fa3e4e7d3b28faed 100644 --- a/src/backend/gporca/data/dxl/minidump/InnerJoin-With-OuterRefs.mdp +++ b/src/backend/gporca/data/dxl/minidump/InnerJoin-With-OuterRefs.mdp @@ -530,7 +530,7 @@ - + @@ -541,7 +541,7 @@ - + @@ -552,7 +552,7 @@ - + @@ -681,7 +681,7 @@ - + diff --git a/src/backend/gporca/data/dxl/minidump/OuterJoin-With-OuterRefs.mdp b/src/backend/gporca/data/dxl/minidump/OuterJoin-With-OuterRefs.mdp index 5c97d92c3bc47479c5d0f8ac3f7e916e363bd60f..0349e20623c51c1ae2953c513f6d4139f90d84ae 100644 --- a/src/backend/gporca/data/dxl/minidump/OuterJoin-With-OuterRefs.mdp +++ b/src/backend/gporca/data/dxl/minidump/OuterJoin-With-OuterRefs.mdp @@ -1,5 +1,26 @@ + + diff --git 
a/src/backend/gporca/data/dxl/parse_tests/q26-Metadata.xml b/src/backend/gporca/data/dxl/parse_tests/q26-Metadata.xml index d215b168e6fede89737317bfbb0adcf801ace279..6d6e051d23d415b805a62595a965e351694a01f2 100644 --- a/src/backend/gporca/data/dxl/parse_tests/q26-Metadata.xml +++ b/src/backend/gporca/data/dxl/parse_tests/q26-Metadata.xml @@ -173,7 +173,7 @@ - + @@ -185,14 +185,14 @@ - + - + diff --git a/src/backend/gporca/libgpopt/include/gpopt/base/CUtils.h b/src/backend/gporca/libgpopt/include/gpopt/base/CUtils.h index 19b490e4608dd7fdd13da3bddddf43420e61b33e..5567919714072f0820b97aeb8932defe54533d72 100644 --- a/src/backend/gporca/libgpopt/include/gpopt/base/CUtils.h +++ b/src/backend/gporca/libgpopt/include/gpopt/base/CUtils.h @@ -1077,11 +1077,9 @@ namespace gpopt static BOOL FCrossJoin(CExpression *pexpr); - // extract scalar ident column reference from scalar expression containing - // only one scalar ident in the tree - const static - CColRef *PcrExtractFromScExpression(CExpression *pexpr); - + // is this scalar expression an NDV-preserving function (used for join stats derivation) + static + BOOL IsExprNDVPreserving(CExpression *pexpr, const CColRef **underlying_colref); // search the given array of predicates for predicates with equality or IS NOT // DISTINCT FROM operators that has one side equal to the given expression diff --git a/src/backend/gporca/libgpopt/src/base/CUtils.cpp b/src/backend/gporca/libgpopt/src/base/CUtils.cpp index 00c2b91e476e79bfec52e881a2b28d95f5d69f95..51dc0b183ddb6c0a404fafcbd5f9d98c8ae11e77 100644 --- a/src/backend/gporca/libgpopt/src/base/CUtils.cpp +++ b/src/backend/gporca/libgpopt/src/base/CUtils.cpp @@ -5116,18 +5116,112 @@ CUtils::FCrossJoin return fCrossJoin; } -// extract scalar ident column reference from scalar expression containing -// only one scalar ident in the tree -const CColRef * -CUtils::PcrExtractFromScExpression +// Determine whether a scalar expression consists only of a scalar id and NDV-preserving +// 
functions plus casts. If so, return the corresponding CColRef. +BOOL +CUtils::IsExprNDVPreserving ( - CExpression *pexpr + CExpression *pexpr, + const CColRef **underlying_colref ) { - if (pexpr->DeriveUsedColumns()->Size() == 1) - return pexpr->DeriveUsedColumns()->PcrFirst(); + CExpression *curr_expr = pexpr; + + *underlying_colref = NULL; + + // go down the expression tree, visiting the child containing a scalar ident until + // we found the ident or until we found a non-NDV-preserving function (at which point there + // is no more need to check) + while (1) + { + COperator *pop = curr_expr->Pop(); + ULONG child_with_scalar_ident = 0; + + switch (pop->Eopid()) + { + case COperator::EopScalarIdent: + { + // we reached the bottom of the expression, return the ColRef + CScalarIdent *cr = CScalarIdent::PopConvert(pop); + + *underlying_colref = cr->Pcr(); + GPOS_ASSERT(1 == pexpr->DeriveUsedColumns()->Size()); + return true; + } + + case COperator::EopScalarCast: + // skip over casts + // Note: We might in the future investigate whether there are some casts + // that reduce NDVs by too much. Most, if not all, casts that have that potential are + // converted to functions, though. Examples: timestamp -> date, double precision -> int. + break; + + case COperator::EopScalarCoalesce: + { + // coalesce(col, const1, ... 
constn) is treated as an NDV-preserving function + for (ULONG c=1; c < curr_expr->Arity(); c++) + { + if (0 < (*curr_expr)[c]->DeriveUsedColumns()->Size()) + { + // this coalesce has a ColRef in the second or later arguments, assume for + // now that this doesn't preserve NDVs (we could add logic to support this case later) + return false; + } + } + break; + } + case COperator::EopScalarFunc: + { + // check whether the function is NDV-preserving + CMDAccessor *md_accessor = COptCtxt::PoctxtFromTLS()->Pmda(); + CScalarFunc *sf = CScalarFunc::PopConvert(pop); + const IMDFunction *pmdfunc = md_accessor->RetrieveFunc(sf->FuncMdId()); + + if (!pmdfunc->IsNDVPreserving() || 1 != curr_expr->Arity()) + { + return false; + } + break; + } + + case COperator::EopScalarOp: + { + CMDAccessor *md_accessor = COptCtxt::PoctxtFromTLS()->Pmda(); + CScalarOp *so = CScalarOp::PopConvert(pop); + const IMDScalarOp *pmdscop = md_accessor->RetrieveScOp(so->MdIdOp()); + + if (!pmdscop->IsNDVPreserving() || 2 != curr_expr->Arity()) + { + return false; + } - return NULL; + // col <op> const is NDV-preserving, and so is const <op> col + if (0 ==(*curr_expr)[1]->DeriveUsedColumns()->Size()) + { + // col <op> const + child_with_scalar_ident = 0; + } + else if (0 ==(*curr_expr)[0]->DeriveUsedColumns()->Size()) + { + // const <op> col + child_with_scalar_ident = 1; + } + else + { + // give up for now, both children reference a column, + // e.g. 
col1 <op> col2 + return false; + } + break; + } + + default: + // anything else we see is considered non-NDV-preserving + return false; + } + + curr_expr = (*curr_expr)[child_with_scalar_ident]; + } } diff --git a/src/backend/gporca/libgpopt/src/operators/CLogicalDifference.cpp b/src/backend/gporca/libgpopt/src/operators/CLogicalDifference.cpp index bcd0f57b4de16af28d99dfd0d2ae17631b4a3804..3a76ce46e7aa3567f7b1eb968d570b21a36688f9 100644 --- a/src/backend/gporca/libgpopt/src/operators/CLogicalDifference.cpp +++ b/src/backend/gporca/libgpopt/src/operators/CLogicalDifference.cpp @@ -182,7 +182,8 @@ CLogicalDifference::PstatsDerive exprhdl, pexprScCond, output_colrefsets, - outer_refs + outer_refs, + true // is an LASJ ); IStatistics *LASJ_stats = outer_stats->CalcLASJoinStats ( diff --git a/src/backend/gporca/libgpopt/src/operators/CLogicalDifferenceAll.cpp b/src/backend/gporca/libgpopt/src/operators/CLogicalDifferenceAll.cpp index 0d0f43b411aeee233efe4dd53cbf4425511aa448..2ed49ed5d614040fff4cde1401b1d4c8ea0f61a9 100644 --- a/src/backend/gporca/libgpopt/src/operators/CLogicalDifferenceAll.cpp +++ b/src/backend/gporca/libgpopt/src/operators/CLogicalDifferenceAll.cpp @@ -179,7 +179,8 @@ CLogicalDifferenceAll::PstatsDerive exprhdl, pexprScCond, output_colrefsets, - outer_refs + outer_refs, + true // is an LASJ ); IStatistics *LASJ_stats = outer_stats->CalcLASJoinStats ( diff --git a/src/backend/gporca/libgpopt/src/operators/CLogicalIntersectAll.cpp b/src/backend/gporca/libgpopt/src/operators/CLogicalIntersectAll.cpp index 9477c4a53e40a5b7036361a6a2e0c8923e519ca5..85594fb6d951bb70e16e1066585b5b84119053cb 100644 --- a/src/backend/gporca/libgpopt/src/operators/CLogicalIntersectAll.cpp +++ b/src/backend/gporca/libgpopt/src/operators/CLogicalIntersectAll.cpp @@ -200,7 +200,8 @@ CLogicalIntersectAll::PstatsDerive exprhdl, pexprScCond, output_colrefsets, - outer_refs + outer_refs, + true // is a semi-join ); IStatistics *pstatsSemiJoin = CLogicalLeftSemiJoin::PstatsDerive(mp, 
join_preds_stats, outer_stats, inner_side_stats); diff --git a/src/backend/gporca/libgpopt/src/operators/CLogicalLeftAntiSemiJoin.cpp b/src/backend/gporca/libgpopt/src/operators/CLogicalLeftAntiSemiJoin.cpp index 33a9017c15796900d429c8c42e9dbabbeca4949f..497d40548322a1d49f2927ab9fbba38d49cbf978 100644 --- a/src/backend/gporca/libgpopt/src/operators/CLogicalLeftAntiSemiJoin.cpp +++ b/src/backend/gporca/libgpopt/src/operators/CLogicalLeftAntiSemiJoin.cpp @@ -149,7 +149,7 @@ CLogicalLeftAntiSemiJoin::PstatsDerive GPOS_ASSERT(Esp(exprhdl) > EspNone); IStatistics *outer_stats = exprhdl.Pstats(0); IStatistics *inner_side_stats = exprhdl.Pstats(1); - CStatsPredJoinArray *join_preds_stats = CStatsPredUtils::ExtractJoinStatsFromExprHandle(mp, exprhdl); + CStatsPredJoinArray *join_preds_stats = CStatsPredUtils::ExtractJoinStatsFromExprHandle(mp, exprhdl, true /*LASJ*/); IStatistics *pstatsLASJoin = outer_stats->CalcLASJoinStats ( mp, diff --git a/src/backend/gporca/libgpopt/src/operators/CLogicalLeftSemiJoin.cpp b/src/backend/gporca/libgpopt/src/operators/CLogicalLeftSemiJoin.cpp index aef81ad4946dda4f50ba3b586594ad1ea93021e1..42919cd645932323c47be60a8028ef8781623c7a 100644 --- a/src/backend/gporca/libgpopt/src/operators/CLogicalLeftSemiJoin.cpp +++ b/src/backend/gporca/libgpopt/src/operators/CLogicalLeftSemiJoin.cpp @@ -171,7 +171,7 @@ CLogicalLeftSemiJoin::PstatsDerive GPOS_ASSERT(Esp(exprhdl) > EspNone); IStatistics *outer_stats = exprhdl.Pstats(0); IStatistics *inner_side_stats = exprhdl.Pstats(1); - CStatsPredJoinArray *join_preds_stats = CStatsPredUtils::ExtractJoinStatsFromExprHandle(mp, exprhdl); + CStatsPredJoinArray *join_preds_stats = CStatsPredUtils::ExtractJoinStatsFromExprHandle(mp, exprhdl, true/*semi-join*/); IStatistics *pstatsSemiJoin = PstatsDerive(mp, join_preds_stats, outer_stats, inner_side_stats); join_preds_stats->Release(); diff --git a/src/backend/gporca/libnaucrates/include/naucrates/dxl/parser/CParseHandlerMDGPDBFunc.h 
b/src/backend/gporca/libnaucrates/include/naucrates/dxl/parser/CParseHandlerMDGPDBFunc.h index 3453b4ff224a83158e2a61c4f87791c903544c67..de1b05d02365750db76b8ed8b543cda9619546b2 100644 --- a/src/backend/gporca/libnaucrates/include/naucrates/dxl/parser/CParseHandlerMDGPDBFunc.h +++ b/src/backend/gporca/libnaucrates/include/naucrates/dxl/parser/CParseHandlerMDGPDBFunc.h @@ -60,6 +60,8 @@ namespace gpdxl // function strictness (i.e. whether func returns NULL on NULL input) BOOL m_is_strict; + + BOOL m_is_ndv_preserving; // private copy ctor CParseHandlerMDGPDBFunc(const CParseHandlerMDGPDBFunc &); diff --git a/src/backend/gporca/libnaucrates/include/naucrates/dxl/parser/CParseHandlerMDGPDBScalarOp.h b/src/backend/gporca/libnaucrates/include/naucrates/dxl/parser/CParseHandlerMDGPDBScalarOp.h index 792ab5433b1db2a360e258698917796c101a5b27..fdf220f26aac7b18879fae19fc5b6ff7d8bb729f 100644 --- a/src/backend/gporca/libnaucrates/include/naucrates/dxl/parser/CParseHandlerMDGPDBScalarOp.h +++ b/src/backend/gporca/libnaucrates/include/naucrates/dxl/parser/CParseHandlerMDGPDBScalarOp.h @@ -65,6 +65,9 @@ namespace gpdxl IMDId *m_mdid_hash_opfamily; IMDId *m_mdid_legacy_hash_opfamily; + // preserves NDVs of inputs + BOOL m_is_ndv_preserving; + // private copy ctor CParseHandlerMDGPDBScalarOp(const CParseHandlerMDGPDBScalarOp &); diff --git a/src/backend/gporca/libnaucrates/include/naucrates/dxl/xml/dxltokens.h b/src/backend/gporca/libnaucrates/include/naucrates/dxl/xml/dxltokens.h index 51b676f7cb5586874d321ed4bc163ed2df100dc7..8ab4c239d10597a165381b087323cc74f3759d4b 100644 --- a/src/backend/gporca/libnaucrates/include/naucrates/dxl/xml/dxltokens.h +++ b/src/backend/gporca/libnaucrates/include/naucrates/dxl/xml/dxltokens.h @@ -573,6 +573,7 @@ namespace gpdxl EdxltokenCmpOther, EdxltokenReturnsNullOnNullInput, + EdxltokenIsNDVPreserving, EdxltokenTriggers, EdxltokenTrigger, @@ -598,6 +599,7 @@ namespace gpdxl EdxltokenGPDBFuncResultTypeId, EdxltokenGPDBFuncReturnsSet, 
EdxltokenGPDBFuncStrict, + EdxltokenGPDBFuncNDVPreserving, EdxltokenGPDBCast, EdxltokenGPDBCastBinaryCoercible, diff --git a/src/backend/gporca/libnaucrates/include/naucrates/md/CMDFunctionGPDB.h b/src/backend/gporca/libnaucrates/include/naucrates/md/CMDFunctionGPDB.h index 8f24e11beacff17557c59bddedddf11856f7fcaa..8547f0a36c7ca0f40ed5ce4f55d3b3b782742f51 100644 --- a/src/backend/gporca/libnaucrates/include/naucrates/md/CMDFunctionGPDB.h +++ b/src/backend/gporca/libnaucrates/include/naucrates/md/CMDFunctionGPDB.h @@ -50,7 +50,7 @@ namespace gpmd IMDId *m_mdid_type_result; // output argument types - IMdIdArray *m_mdid_types_array; + IMdIdArray *m_mdid_types_array; // whether function returns a set of values BOOL m_returns_set; @@ -64,6 +64,10 @@ namespace gpmd // function strictness (i.e. whether func returns NULL on NULL input) BOOL m_is_strict; + // function result has very similar number of distinct values as the + // single function argument (used for cardinality estimation) + BOOL m_is_ndv_preserving; + // dxl token array for stability Edxltoken m_dxl_func_stability_array[EfsSentinel]; @@ -97,7 +101,8 @@ namespace gpmd BOOL ReturnsSet, EFuncStbl func_stability, EFuncDataAcc func_data_access, - BOOL is_strict + BOOL is_strict, + BOOL is_ndv_preserving ); virtual @@ -133,6 +138,12 @@ namespace gpmd return m_is_strict; } + virtual + BOOL IsNDVPreserving() const + { + return m_is_ndv_preserving; + } + // function stability virtual EFuncStbl GetFuncStability() const diff --git a/src/backend/gporca/libnaucrates/include/naucrates/md/CMDScalarOpGPDB.h b/src/backend/gporca/libnaucrates/include/naucrates/md/CMDScalarOpGPDB.h index 83a03cbd5cf3778b932f99754802559a2032086f..ab7577355330a4302b09aaab6561fa0e1f08ddd4 100644 --- a/src/backend/gporca/libnaucrates/include/naucrates/md/CMDScalarOpGPDB.h +++ b/src/backend/gporca/libnaucrates/include/naucrates/md/CMDScalarOpGPDB.h @@ -71,7 +71,7 @@ namespace gpmd // does operator return NULL when all inputs are NULL? 
BOOL m_returns_null_on_null_input; - + // operator classes this operator belongs to IMdIdArray *m_mdid_opfamilies_array; @@ -81,6 +81,10 @@ namespace gpmd // compatible legacy hash op family using legacy (cdbhash) opclass IMDId *m_mdid_legacy_hash_opfamily; + // does operator preserve the NDV of its input(s) + // (used for cardinality estimation) + BOOL m_is_ndv_preserving; + CMDScalarOpGPDB(const CMDScalarOpGPDB &); public: @@ -101,7 +105,8 @@ namespace gpmd BOOL returns_null_on_null_input, IMdIdArray *mdid_opfamilies_array, IMDId *m_mdid_hash_opfamily, - IMDId *mdid_legacy_hash_opfamily + IMDId *mdid_legacy_hash_opfamily, + BOOL is_ndv_preserving ); ~CMDScalarOpGPDB(); @@ -155,6 +160,10 @@ namespace gpmd virtual BOOL ReturnsNullOnNullInput() const; + // preserves NDVs of its inputs? + virtual + BOOL IsNDVPreserving() const; + // comparison type virtual IMDType::ECmpType ParseCmpType() const; diff --git a/src/backend/gporca/libnaucrates/include/naucrates/md/IMDFunction.h b/src/backend/gporca/libnaucrates/include/naucrates/md/IMDFunction.h index 8edc61170486a57d89af7420904b9c2ca1f45719..264a1da8695330ac84231004222c6aa86ca5f526 100644 --- a/src/backend/gporca/libnaucrates/include/naucrates/md/IMDFunction.h +++ b/src/backend/gporca/libnaucrates/include/naucrates/md/IMDFunction.h @@ -65,6 +65,10 @@ namespace gpmd virtual BOOL IsStrict() const = 0; + // does function preserve NDVs of input (for cardinality estimation) + virtual + BOOL IsNDVPreserving() const = 0; + // does function return a set of values virtual BOOL ReturnsSet() const = 0; diff --git a/src/backend/gporca/libnaucrates/include/naucrates/md/IMDScalarOp.h b/src/backend/gporca/libnaucrates/include/naucrates/md/IMDScalarOp.h index 229e8648bf8ae6541902b4a06b3fc3e02769addc..d364f0fde6793013a1a4a59b3262c84d8fdeb53c 100644 --- a/src/backend/gporca/libnaucrates/include/naucrates/md/IMDScalarOp.h +++ b/src/backend/gporca/libnaucrates/include/naucrates/md/IMDScalarOp.h @@ -75,6 +75,10 @@ namespace gpmd virtual 
BOOL ReturnsNullOnNullInput() const = 0; + // preserves NDVs of its inputs? + virtual + BOOL IsNDVPreserving() const = 0; + virtual IMDType::ECmpType ParseCmpType() const = 0; diff --git a/src/backend/gporca/libnaucrates/include/naucrates/statistics/CStatsPred.h b/src/backend/gporca/libnaucrates/include/naucrates/statistics/CStatsPred.h index eb9c6fb9fc227bd4ceae85a044d3b0063d615e6c..aa35eb8a7b5db60dd9452be04669da94b2a92981 100644 --- a/src/backend/gporca/libnaucrates/include/naucrates/statistics/CStatsPred.h +++ b/src/backend/gporca/libnaucrates/include/naucrates/statistics/CStatsPred.h @@ -55,9 +55,8 @@ namespace gpnaucrates EstatscmptINDF, // is not distinct from EstatscmptLike, // LIKE predicate comparison EstatscmptNotLike, // NOT LIKE predicate comparison - // NDV comparision for equality predicate on columns with functions, ex f(a) = b or a = f(b) - EstatscmptEqNDVOuter, // use Outer NDV on inner side also - EstatscmptEqNDVInner, // use Inner NDV on outer side also + // NDV comparison for equality predicate on columns with functions, ex f(a) = b or a = f(b) + EstatscmptEqNDV, EstatscmptOther }; diff --git a/src/backend/gporca/libnaucrates/include/naucrates/statistics/CStatsPredJoin.h b/src/backend/gporca/libnaucrates/include/naucrates/statistics/CStatsPredJoin.h index 9c7b43c3ecbbe6f4dc3c22e2328122dbf91ea392..836b0bb8e1838adee537aacf7b557edbf499e030 100644 --- a/src/backend/gporca/libnaucrates/include/naucrates/statistics/CStatsPredJoin.h +++ b/src/backend/gporca/libnaucrates/include/naucrates/statistics/CStatsPredJoin.h @@ -64,6 +64,11 @@ namespace gpnaucrates {} // accessors + BOOL HasValidColIdOuter() const + { + return gpos::ulong_max != m_colidOuter; + } + ULONG ColIdOuter() const { return m_colidOuter; @@ -75,6 +80,11 @@ namespace gpnaucrates return m_stats_cmp_type; } + BOOL HasValidColIdInner() const + { + return gpos::ulong_max != m_colidInner; + } + ULONG ColIdInner() const { return m_colidInner; diff --git 
a/src/backend/gporca/libnaucrates/include/naucrates/statistics/CStatsPredUtils.h b/src/backend/gporca/libnaucrates/include/naucrates/statistics/CStatsPredUtils.h index 955a352fc607ea71f630718c0ce279b5b70ddcb1..c5daa20e2da23ce6935cefa50951aafc037eb1d9 100644 --- a/src/backend/gporca/libnaucrates/include/naucrates/statistics/CStatsPredUtils.h +++ b/src/backend/gporca/libnaucrates/include/naucrates/statistics/CStatsPredUtils.h @@ -140,32 +140,40 @@ namespace gpopt static CStatsPred::EStatsCmpType GetStatsCmpType(IMDId *mdid); - // derive whether it is EstatscmptEqNDVInner or EstatscmptEqNDVOuter - static - CStatsPred::EStatsCmpType DeriveStatCmpEqNDVType ( ULONG left_index, ULONG right_index, BOOL left_is_null, BOOL right_is_null); - // helper function to extract statistics join filter from a given join predicate static CStatsPredJoin *ExtractJoinStatsFromJoinPred ( CMemoryPool *mp, CExpression *join_predicate_expr, - CColRefSetArray *join_output_col_refset, // array of output columns of join's relational inputs + CColRefSetArray *join_output_col_refset, // array of output columns of join's relational inputs CColRefSet *outer_refs, + BOOL is_semi_or_anti_join, CExpressionArray *unsupported_predicates_expr ); - // is the expression a comparison of scalar idents (or casted scalar idents). - // If so, extract relevant info. + // Is the expression a comparison of scalar idents (or casted scalar idents), + // or of other supported expressions? If so, extract relevant info. 
static - BOOL IsPredCmpColsOrIgnoreCast + BOOL IsJoinPredSupportedForStatsEstimation ( CExpression *expr, - const CColRef **col_ref1, + CColRefSetArray *output_col_refsets, // array of output columns of join's relational inputs + BOOL is_semi_or_anti_join, CStatsPred::EStatsCmpType *stats_pred_cmp_type, - const CColRef **col_ref2, - BOOL &left_is_null, - BOOL &right_is_null + const CColRef **col_ref_outer, + const CColRef **col_ref_inner + ); + + // find out which input expression refers only to the inner table and which + // refers only to the outer table, and return accordingly + static BOOL AssignExprsToOuterAndInner + ( + CColRefSetArray *output_col_refsets, // array of output columns of join's relational inputs + CExpression *expr_1, + CExpression *expr_2, + CExpression **outer_expr, + CExpression **inner_expr ); public: @@ -180,14 +188,20 @@ namespace gpopt ( CMemoryPool *mp, CExpression *scalar_expr, - CColRefSetArray *output_col_refset, // array of output columns of join's relational inputs + CColRefSetArray *output_col_refset, // array of output columns of join's relational inputs CColRefSet *outer_refs, + BOOL is_semi_or_anti_join, CStatsPred **unsupported_pred_stats ); // helper function to extract array of statistics join filter from an expression handle static - CStatsPredJoinArray *ExtractJoinStatsFromExprHandle(CMemoryPool *mp, CExpressionHandle &expr_handle); + CStatsPredJoinArray *ExtractJoinStatsFromExprHandle + ( + CMemoryPool *mp, + CExpressionHandle &expr_handle, + BOOL is_semi_or_anti_join + ); // helper function to extract array of statistics join filter from an expression static @@ -197,7 +211,8 @@ namespace gpopt CExpressionHandle &expr_handle, CExpression *scalar_expression, CColRefSetArray *output_col_refset, - CColRefSet *outer_refs + CColRefSet *outer_refs, + BOOL is_semi_or_anti_join ); // is the predicate a conjunctive or disjunctive predicate diff --git a/src/backend/gporca/libnaucrates/src/md/CMDFunctionGPDB.cpp 
b/src/backend/gporca/libnaucrates/src/md/CMDFunctionGPDB.cpp index ee1abb2149fae654223054c94a5f495c9a1311d3..360840eb9216f2ebaa98a588ad65a948e251aa91 100644 --- a/src/backend/gporca/libnaucrates/src/md/CMDFunctionGPDB.cpp +++ b/src/backend/gporca/libnaucrates/src/md/CMDFunctionGPDB.cpp @@ -38,7 +38,8 @@ CMDFunctionGPDB::CMDFunctionGPDB BOOL ReturnsSet, EFuncStbl func_stability, EFuncDataAcc func_data_access, - BOOL is_strict + BOOL is_strict, + BOOL is_ndv_preserving ) : m_mp(mp), @@ -49,7 +50,8 @@ CMDFunctionGPDB::CMDFunctionGPDB m_returns_set(ReturnsSet), m_func_stability(func_stability), m_func_data_access(func_data_access), - m_is_strict(is_strict) + m_is_strict(is_strict), + m_is_ndv_preserving(is_ndv_preserving) { GPOS_ASSERT(m_mdid->IsValid()); GPOS_ASSERT(EfsSentinel > func_stability); @@ -228,6 +230,7 @@ CMDFunctionGPDB::Serialize xml_serializer->AddAttribute(CDXLTokens::GetDXLTokenStr(EdxltokenGPDBFuncStability), GetFuncStabilityStr()); xml_serializer->AddAttribute(CDXLTokens::GetDXLTokenStr(EdxltokenGPDBFuncDataAccess), GetFuncDataAccessStr()); xml_serializer->AddAttribute(CDXLTokens::GetDXLTokenStr(EdxltokenGPDBFuncStrict), m_is_strict); + xml_serializer->AddAttribute(CDXLTokens::GetDXLTokenStr(EdxltokenGPDBFuncNDVPreserving), m_is_ndv_preserving); SerializeMDIdAsElem(xml_serializer, CDXLTokens::GetDXLTokenStr(EdxltokenGPDBFuncResultTypeId), m_mdid_type_result); diff --git a/src/backend/gporca/libnaucrates/src/md/CMDScalarOpGPDB.cpp b/src/backend/gporca/libnaucrates/src/md/CMDScalarOpGPDB.cpp index 0dde8ab3e47b76044b6007810bca4f310b6f0997..4a8012a4b386f29664f607f3ed89788275ab4a2f 100644 --- a/src/backend/gporca/libnaucrates/src/md/CMDScalarOpGPDB.cpp +++ b/src/backend/gporca/libnaucrates/src/md/CMDScalarOpGPDB.cpp @@ -43,7 +43,8 @@ CMDScalarOpGPDB::CMDScalarOpGPDB BOOL returns_null_on_null_input, IMdIdArray *mdid_opfamilies_array, IMDId *mdid_hash_opfamily, - IMDId *mdid_legacy_hash_opfamily + IMDId *mdid_legacy_hash_opfamily, + BOOL is_ndv_preserving ) 
: m_mp(mp), @@ -59,7 +60,8 @@ CMDScalarOpGPDB::CMDScalarOpGPDB m_returns_null_on_null_input(returns_null_on_null_input), m_mdid_opfamilies_array(mdid_opfamilies_array), m_mdid_hash_opfamily(mdid_hash_opfamily), - m_mdid_legacy_hash_opfamily(mdid_legacy_hash_opfamily) + m_mdid_legacy_hash_opfamily(mdid_legacy_hash_opfamily), + m_is_ndv_preserving(is_ndv_preserving) { GPOS_ASSERT(NULL != mdid_opfamilies_array); m_dxl_str = CDXLUtils::SerializeMDObj(m_mp, this, false /*fSerializeHeader*/, false /*indentation*/); @@ -236,6 +238,12 @@ CMDScalarOpGPDB::ReturnsNullOnNullInput() const } +BOOL +CMDScalarOpGPDB::IsNDVPreserving() const +{ + return m_is_ndv_preserving; +} + //--------------------------------------------------------------------------- // @function: // CMDScalarOpGPDB::ParseCmpType @@ -272,6 +280,7 @@ CMDScalarOpGPDB::Serialize xml_serializer->AddAttribute(CDXLTokens::GetDXLTokenStr(EdxltokenName), m_mdname->GetMDName()); xml_serializer->AddAttribute(CDXLTokens::GetDXLTokenStr(EdxltokenGPDBScalarOpCmpType), IMDType::GetCmpTypeStr(m_comparision_type)); xml_serializer->AddAttribute(CDXLTokens::GetDXLTokenStr(EdxltokenReturnsNullOnNullInput), m_returns_null_on_null_input); + xml_serializer->AddAttribute(CDXLTokens::GetDXLTokenStr(EdxltokenIsNDVPreserving), m_is_ndv_preserving); Edxltoken dxl_token_array[8] = { EdxltokenGPDBScalarOpLeftTypeId, EdxltokenGPDBScalarOpRightTypeId, diff --git a/src/backend/gporca/libnaucrates/src/parser/CParseHandlerMDGPDBFunc.cpp b/src/backend/gporca/libnaucrates/src/parser/CParseHandlerMDGPDBFunc.cpp index 15f402662254294766cf092178ea34f74e9e3797..bf6cb3c2cbe84b851ad1eea4d216986b650cbaef 100644 --- a/src/backend/gporca/libnaucrates/src/parser/CParseHandlerMDGPDBFunc.cpp +++ b/src/backend/gporca/libnaucrates/src/parser/CParseHandlerMDGPDBFunc.cpp @@ -105,6 +105,17 @@ CParseHandlerMDGPDBFunc::StartElement EdxltokenGPDBFunc ); + // parse whether func is NDV-preserving + m_is_ndv_preserving = 
CDXLOperatorFactory::ExtractConvertAttrValueToBool + ( + m_parse_handler_mgr->GetDXLMemoryManager(), + attrs, + EdxltokenGPDBFuncNDVPreserving, + EdxltokenGPDBFunc, + true, // optional + false // default is false + ); + // parse func stability property const XMLCh *xmlszStbl = CDXLOperatorFactory::ExtractAttrValue ( @@ -190,7 +201,8 @@ CParseHandlerMDGPDBFunc::EndElement m_returns_set, m_func_stability, m_func_data_access, - m_is_strict); + m_is_strict, + m_is_ndv_preserving); // deactivate handler m_parse_handler_mgr->DeactivateHandler(); diff --git a/src/backend/gporca/libnaucrates/src/parser/CParseHandlerMDGPDBScalarOp.cpp b/src/backend/gporca/libnaucrates/src/parser/CParseHandlerMDGPDBScalarOp.cpp index 7adeb74aacc05713d779618a3d5e068263188985..cd074b5063f83126c951b2f9d5cb3da0d4c82b2d 100644 --- a/src/backend/gporca/libnaucrates/src/parser/CParseHandlerMDGPDBScalarOp.cpp +++ b/src/backend/gporca/libnaucrates/src/parser/CParseHandlerMDGPDBScalarOp.cpp @@ -53,7 +53,8 @@ CParseHandlerMDGPDBScalarOp::CParseHandlerMDGPDBScalarOp m_comparision_type(IMDType::EcmptOther), m_returns_null_on_null_input(false), m_mdid_hash_opfamily(NULL), - m_mdid_legacy_hash_opfamily(NULL) + m_mdid_legacy_hash_opfamily(NULL), + m_is_ndv_preserving(false) { } @@ -122,6 +123,17 @@ CParseHandlerMDGPDBScalarOp::StartElement ); } + // ndv-preserving property is optional + m_is_ndv_preserving = CDXLOperatorFactory::ExtractConvertAttrValueToBool + ( + m_parse_handler_mgr->GetDXLMemoryManager(), + attrs, + EdxltokenIsNDVPreserving, + EdxltokenGPDBScalarOp, + true, // is optional + false // default value + ); + } else if (0 == XMLString::compareString(CDXLTokens::XmlstrToken(EdxltokenGPDBScalarOpLeftTypeId), element_local_name)) { @@ -292,7 +304,8 @@ CParseHandlerMDGPDBScalarOp::EndElement m_returns_null_on_null_input, mdid_opfamilies_array, m_mdid_hash_opfamily, - m_mdid_legacy_hash_opfamily + m_mdid_legacy_hash_opfamily, + m_is_ndv_preserving ) ; diff --git 
a/src/backend/gporca/libnaucrates/src/statistics/CJoinStatsProcessor.cpp b/src/backend/gporca/libnaucrates/src/statistics/CJoinStatsProcessor.cpp index 73c10fbec2664466c51fa489f8ef4f5825e54f9a..43324bc5a83b90720ca693b87ad0f0e99cbb4939 100644 --- a/src/backend/gporca/libnaucrates/src/statistics/CJoinStatsProcessor.cpp +++ b/src/backend/gporca/libnaucrates/src/statistics/CJoinStatsProcessor.cpp @@ -216,6 +216,7 @@ CJoinStatsProcessor::CalcAllJoinStats join_preds_available, output_colrefsets, outer_refs, + is_a_left_join, // left joins use an anti-semijoin internally &unsupported_pred_stats ); @@ -307,8 +308,11 @@ CJoinStatsProcessor::SetResultingJoinStats { CStatsPredJoin *join_stats = (*join_pred_stats_info)[i]; - (void) join_colids->ExchangeSet(join_stats->ColIdOuter()); - if (!semi_join) + if (join_stats->HasValidColIdOuter()) + { + (void) join_colids->ExchangeSet(join_stats->ColIdOuter()); + } + if (!semi_join && join_stats->HasValidColIdInner()) { (void) join_colids->ExchangeSet(join_stats->ColIdInner()); } @@ -331,30 +335,43 @@ CJoinStatsProcessor::SetResultingJoinStats for (ULONG i = 0; i < num_join_conds; i++) { CStatsPredJoin *pred_info = (*join_pred_stats_info)[i]; - CStatsPred::EStatsCmpType stats_cmp_type = pred_info->GetCmpType(); ULONG colid1 = pred_info->ColIdOuter(); ULONG colid2 = pred_info->ColIdInner(); GPOS_ASSERT(colid1 != colid2); - // find the histograms corresponding to the two columns - const CHistogram *outer_histogram = outer_stats->GetHistogram(colid1); - // are column id1 and 2 always in the order of outer inner? 
- const CHistogram *inner_histogram = inner_side_stats->GetHistogram(colid2); - GPOS_ASSERT(NULL != outer_histogram); - GPOS_ASSERT(NULL != inner_histogram); + const CHistogram *outer_histogram = NULL; + const CHistogram *inner_histogram = NULL; BOOL is_input_empty = CStatistics::IsEmptyJoin(outer_stats, inner_side_stats, IsLASJ); CDouble local_scale_factor(1.0); CHistogram *outer_histogram_after = NULL; CHistogram *inner_histogram_after = NULL; + + // find the histograms corresponding to the two columns + // are column id1 and 2 always in the order of outer inner? + if (pred_info->HasValidColIdOuter()) + { + outer_histogram = outer_stats->GetHistogram(colid1); + GPOS_ASSERT(NULL != outer_histogram); + } + if (pred_info->HasValidColIdInner()) + { + inner_histogram = inner_side_stats->GetHistogram(colid2); + GPOS_ASSERT(NULL != inner_histogram); + } + // When we have any form of equi join with join condition of type f(a)=b, // we calculate the NDV of such a join as NDV(b) ( from Selinger et al.) 
- if (CStatsPred::EstatscmptEqNDVOuter == stats_cmp_type) + if (NULL == outer_histogram) { - inner_histogram = outer_histogram; + GPOS_ASSERT(CStatsPred::EstatscmptEqNDV == pred_info->GetCmpType()); + outer_histogram = inner_histogram; + colid1 = colid2; } - else if (CStatsPred::EstatscmptEqNDVInner == stats_cmp_type) + else if (NULL == inner_histogram) { - outer_histogram = inner_histogram; + GPOS_ASSERT(CStatsPred::EstatscmptEqNDV == pred_info->GetCmpType()); + inner_histogram = outer_histogram; + colid2 = colid1; } JoinHistograms @@ -377,7 +394,7 @@ CJoinStatsProcessor::SetResultingJoinStats output_is_empty = JoinStatsAreEmpty(outer_stats->IsEmpty(), output_is_empty, outer_histogram, inner_histogram, outer_histogram_after, join_type); CStatisticsUtils::AddHistogram(mp, colid1, outer_histogram_after, result_col_hist_mapping); - if (!semi_join) + if (!semi_join && colid1 != colid2) { CStatisticsUtils::AddHistogram(mp, colid2, inner_histogram_after, result_col_hist_mapping); } @@ -385,6 +402,7 @@ CJoinStatsProcessor::SetResultingJoinStats GPOS_DELETE(outer_histogram_after); GPOS_DELETE(inner_histogram_after); + // remember which tables the columns came from, this info is used to combine scale factors CColumnFactory *col_factory = COptCtxt::PoctxtFromTLS()->Pcf(); CColRef *colref_outer = col_factory->LookupColRef(colid1); @@ -401,6 +419,9 @@ CJoinStatsProcessor::SetResultingJoinStats // there should only be two tables involved in a join condition // if the predicate is more complex (i.e. more than 2 tables involved in the predicate such as t1.a=t2.a+t3.a), // the mdid of the base table will be NULL: + // Note that we hash on the pointer to the Mdid, not the value of the Mdid, + // but we know that CColRef::GetMdidTable() will always return the same + // pointer for a given table. 
mdid_pair = GPOS_NEW(mp) IMdIdArray(mp, 2); mdid_outer->AddRef(); mdid_inner->AddRef(); diff --git a/src/backend/gporca/libnaucrates/src/statistics/CLeftOuterJoinStatsProcessor.cpp b/src/backend/gporca/libnaucrates/src/statistics/CLeftOuterJoinStatsProcessor.cpp index 398ed9ffa2e1fb8c0bed38e8acd0f5adbdbab466..5c7bbc162d333a2930a661b6012f94235b6fdca0 100644 --- a/src/backend/gporca/libnaucrates/src/statistics/CLeftOuterJoinStatsProcessor.cpp +++ b/src/backend/gporca/libnaucrates/src/statistics/CLeftOuterJoinStatsProcessor.cpp @@ -97,11 +97,14 @@ CLeftOuterJoinStatsProcessor::MakeLOJHistogram GPOS_ASSERT(NULL != inner_join_stats); // build a bitset with all outer child columns contributing to the join - CBitSet *outer_side_cols = GPOS_NEW(mp) CBitSet(mp); + CBitSet *outer_side_join_cols = GPOS_NEW(mp) CBitSet(mp); for (ULONG j = 0; j < join_preds_stats->Size(); j++) { CStatsPredJoin *join_stats = (*join_preds_stats)[j]; - (void) outer_side_cols->ExchangeSet(join_stats->ColIdOuter()); + if (join_stats->HasValidColIdOuter()) + { + (void) outer_side_join_cols->ExchangeSet(join_stats->ColIdOuter()); + } } // for the columns in the outer child, compute the buckets that do not contribute to the inner join @@ -129,7 +132,7 @@ CLeftOuterJoinStatsProcessor::MakeLOJHistogram const CHistogram *inner_join_histogram = inner_join_stats->GetHistogram(colid); GPOS_ASSERT(NULL != inner_join_histogram); - if (outer_side_cols->Get(colid)) + if (outer_side_join_cols->Get(colid)) { // add buckets from the outer histogram that do not contribute to the inner join const CHistogram *LASJ_histogram = LASJ_stats->GetHistogram(colid); @@ -167,7 +170,7 @@ CLeftOuterJoinStatsProcessor::MakeLOJHistogram // clean up inner_colids_with_stats->Release(); outer_colids_with_stats->Release(); - outer_side_cols->Release(); + outer_side_join_cols->Release(); return LOJ_histograms; } diff --git a/src/backend/gporca/libnaucrates/src/statistics/CLeftSemiJoinStatsProcessor.cpp 
b/src/backend/gporca/libnaucrates/src/statistics/CLeftSemiJoinStatsProcessor.cpp index e93489d11e3e9c5ba25ccd50f73bac90e23ac7c4..505f9a308031e0bf3419290f03fcc4536bbf92d7 100644 --- a/src/backend/gporca/libnaucrates/src/statistics/CLeftSemiJoinStatsProcessor.cpp +++ b/src/backend/gporca/libnaucrates/src/statistics/CLeftSemiJoinStatsProcessor.cpp @@ -34,8 +34,11 @@ CLeftSemiJoinStatsProcessor::CalcLSJoinStatsStatic ULongPtrArray *inner_colids = GPOS_NEW(mp) ULongPtrArray(mp); for (ULONG ul = 0; ul < length; ul++) { - ULONG colid = ((*join_preds_stats)[ul])->ColIdInner(); - inner_colids->Append(GPOS_NEW(mp) ULONG(colid)); + if ((*join_preds_stats)[ul]->HasValidColIdInner()) + { + ULONG colid = ((*join_preds_stats)[ul])->ColIdInner(); + inner_colids->Append(GPOS_NEW(mp) ULONG(colid)); + } } // dummy agg columns required for group by derivation diff --git a/src/backend/gporca/libnaucrates/src/statistics/CStatisticsUtils.cpp b/src/backend/gporca/libnaucrates/src/statistics/CStatisticsUtils.cpp index d0a3c6548520d097339097d1abbaaa537a143aba..10bd6fb51d5bd0ae829629b8f8c4b51be16e5ac3 100644 --- a/src/backend/gporca/libnaucrates/src/statistics/CStatisticsUtils.cpp +++ b/src/backend/gporca/libnaucrates/src/statistics/CStatisticsUtils.cpp @@ -1180,6 +1180,7 @@ CStatisticsUtils::DeriveStatsForDynamicScan scalar_expr, output_colrefs, outer_refs, + true, // semi-join &unsupported_pred_stats ); @@ -1863,9 +1864,7 @@ CStatisticsUtils::IsStatsCmpTypeNdvEq CStatsPred::EStatsCmpType stats_cmp_type ) { - return (CStatsPred::EstatscmptEqNDVOuter == stats_cmp_type || - CStatsPred::EstatscmptEqNDVInner == stats_cmp_type - ); + return (CStatsPred::EstatscmptEqNDV == stats_cmp_type); } //--------------------------------------------------------------------------- // @function: diff --git a/src/backend/gporca/libnaucrates/src/statistics/CStatsPredUtils.cpp b/src/backend/gporca/libnaucrates/src/statistics/CStatsPredUtils.cpp index 
ab32d7d4e7b544d299c827182447915a17b5f1e7..71580fda91ec7869068d8161c48fbe17860e064c 100644 --- a/src/backend/gporca/libnaucrates/src/statistics/CStatsPredUtils.cpp +++ b/src/backend/gporca/libnaucrates/src/statistics/CStatsPredUtils.cpp @@ -59,34 +59,29 @@ CStatsPredUtils::StatsCmpType CStatsPred::EStatsCmpType stats_cmp_type = CStatsPred::EstatscmptOther; + CWStringConst str_eq(GPOS_WSZ_LIT("=")); CWStringConst str_lt(GPOS_WSZ_LIT("<")); CWStringConst str_leq(GPOS_WSZ_LIT("<=")); - CWStringConst str_eq(GPOS_WSZ_LIT("=")); CWStringConst str_geq(GPOS_WSZ_LIT(">=")); CWStringConst str_gt(GPOS_WSZ_LIT(">")); CWStringConst str_neq(GPOS_WSZ_LIT("<>")); - if (str_opname->Equals(&str_lt)) + if (str_opname->Equals(&str_eq)) + { + stats_cmp_type = CStatsPred::EstatscmptEq; + } else if (str_opname->Equals(&str_lt)) { stats_cmp_type = CStatsPred::EstatscmptL; - } - if (str_opname->Equals(&str_leq)) + } else if (str_opname->Equals(&str_leq)) { stats_cmp_type = CStatsPred::EstatscmptLEq; - } - if (str_opname->Equals(&str_eq)) - { - stats_cmp_type = CStatsPred::EstatscmptEq; - } - if (str_opname->Equals(&str_geq)) + } else if (str_opname->Equals(&str_geq)) { stats_cmp_type = CStatsPred::EstatscmptGEq; - } - if (str_opname->Equals(&str_gt)) + } else if (str_opname->Equals(&str_gt)) { stats_cmp_type = CStatsPred::EstatscmptG; - } - if (str_opname->Equals(&str_neq)) + } else if (str_opname->Equals(&str_neq)) { stats_cmp_type = CStatsPred::EstatscmptNEq; } @@ -323,40 +318,69 @@ CStatsPredUtils::GetPredStats //--------------------------------------------------------------------------- -// @function: -// CStatsPredUtils::IsPredCmpColsOrIgnoreCast +// CStatsPredUtils::IsJoinPredSupportedForStatsEstimation // -// @doc: -// Is the expression a comparison of scalar ident or cast of a scalar ident? -// Extract relevant info. +// Given a join predicate <expr>, return whether this is a supported +// join predicate for cardinality estimation, and what method to use +// to build the join statistics. 
+// +// Also return ColRefs for those sides of the comparison predicate that +// can be used (either the entire histogram or just the NDV). +// +// Supported predicates: +// +// All of these must reference the outer table only on one side +// and the inner table only on the other side. // +// col1 <op> col2 (op could be INDF, IDF, =, <, <=, >, >=, <>) +// col1 = p(col2) (p is an NDV-preserving function) +// p(col1) = p(col2) +// col1 = expr(col2...coln) +// p(col1) = expr(col2...coln) +// +// plus variations of the above, flipping sides and adding casts. +// Non-NDV-preserving expressions are not allowed on the inner side +// of semi and anti-semijoins because we need the NDV of the join column +// for those (LOJ stats are calculated using a semi-join, so the +// restriction affects those as well). +// +// For all but the first line above, we use an NDV-based stats method. //--------------------------------------------------------------------------- BOOL -CStatsPredUtils::IsPredCmpColsOrIgnoreCast +CStatsPredUtils::IsJoinPredSupportedForStatsEstimation ( CExpression *expr, - const CColRef **col_ref_left, + CColRefSetArray *output_col_refsets, // array of output columns of join's relational inputs + BOOL is_semi_or_anti_join, CStatsPred::EStatsCmpType *stats_pred_cmp_type, - const CColRef **col_ref_right, - BOOL &left_is_null, - BOOL &right_is_null + const CColRef **col_ref_outer, + const CColRef **col_ref_inner ) { - GPOS_ASSERT(NULL != col_ref_left); - GPOS_ASSERT(NULL != col_ref_right); + GPOS_ASSERT(NULL != col_ref_outer); + GPOS_ASSERT(NULL != col_ref_inner); + GPOS_ASSERT(NULL == *col_ref_outer); + GPOS_ASSERT(NULL == *col_ref_inner); COperator *expr_op = expr->Pop(); BOOL is_INDF = CPredicateUtils::FINDF(expr); BOOL is_IDF = CPredicateUtils::FIDF(expr); BOOL is_scalar_cmp = (COperator::EopScalarCmp == expr_op->Eopid()); + // left and right children of our join pred operator + CExpression *expr_left = NULL; + CExpression *expr_right = NULL; + + // initialize output 
parameters + *col_ref_inner = NULL; + *col_ref_outer = NULL; + if (!is_scalar_cmp && !is_INDF && !is_IDF) { + // an unsupported expression + *stats_pred_cmp_type = CStatsPred::EstatscmptOther; return false; } - CExpression *expr_left = NULL; - CExpression *expr_right = NULL; - if (is_INDF) { (*stats_pred_cmp_type) = CStatsPred::EstatscmptINDF; @@ -384,53 +408,135 @@ CStatsPredUtils::IsPredCmpColsOrIgnoreCast expr_right = (*expr)[1]; } - (*col_ref_left) = CCastUtils::PcrExtractFromScIdOrCastScId(expr_left); - (*col_ref_right) = CCastUtils::PcrExtractFromScIdOrCastScId(expr_right); + // expr_left and expr_right associated with the outer and inner tables + CExpression *assigned_expr_outer = NULL; + CExpression *assigned_expr_inner = NULL; - // if the equi join is of type f(a) = f(b) then it is unsupported stats comparison - // So, we fall back to default stats.(from Selinger et al.) - if (NULL == *col_ref_left && NULL == *col_ref_right) + if (!AssignExprsToOuterAndInner(output_col_refsets, expr_left, expr_right, &assigned_expr_outer, &assigned_expr_inner)) + { + // we are not dealing with a join predicate where one side of the operator + // refers to the outer table and the other side refers to the inner return false; + } + + // check whether left or right expressions are simple columns or casts + // of simple columns + (*col_ref_outer) = CCastUtils::PcrExtractFromScIdOrCastScId(assigned_expr_outer); + (*col_ref_inner) = CCastUtils::PcrExtractFromScIdOrCastScId(assigned_expr_inner); + + if (NULL != *col_ref_outer && NULL != *col_ref_inner) + { + // a simple predicate of the form col1 <op> col2 (casts are allowed) + return true; + } - if (NULL == *col_ref_left || NULL == *col_ref_right) + // if the scalar cmp is of equality type, we may not have been able to extract + // the column references of scalar ident if they had any other expression than cast + // on top of them. 
+ // in such cases, check if there is still a possibility to extract scalar ident, + // if there is more than one column reference on either side, this is unsupported + // If supported, mark the comparison as NDV-based + + if (*stats_pred_cmp_type == CStatsPred::EstatscmptEq) { - if (NULL == *col_ref_left) + BOOL outer_is_ndv_preserving = + (NULL != *col_ref_outer || CUtils::IsExprNDVPreserving(assigned_expr_outer, col_ref_outer)); + BOOL inner_is_ndv_preserving = + (NULL != *col_ref_inner || CUtils::IsExprNDVPreserving(assigned_expr_inner, col_ref_inner)); + + if (!outer_is_ndv_preserving && !inner_is_ndv_preserving) { - left_is_null = true; + // join pred of the form f(a) = f(b) with neither side NDV-preserving, this is not supported + return false; } - if (NULL == *col_ref_right) + if (is_semi_or_anti_join && !inner_is_ndv_preserving) { - right_is_null = true; + // non-NDV-preserving functions on the inner of a semi-join or anti-semijoin + // are not supported, we need the NDV of the inner join columns to calculate + // the stats + return false; } - // if the scalar cmp is of equality type, we may not have been able to extract - // the column referenes of scalar ident if they had any other expression than cast - // on top of them. - // in such cases, check if there is still a possibility to extract scalar ident, - // if there is more than one column reference on either side, this is unsupported - // If supported, mark the comparison as NDV-based + // a join predicate that involves an NDV-preserving function on at least one side, one of + // *col_ref_inner and *col_ref_outer may be NULL. If expr(...) 
is a non-NDV-preserving + // expression and p is an NDV-preserving function, then we can have one of the following + // (including variations with flipped sides and casts added): + // col1 = p(col2) (use max of both NDVs) + // p(col1) = p(col2) (use max of both NDVs) + // col1 = expr(col2...coln) (use NDV of col1) + // p(col1) = expr(col2...coln) (use NDV of col1) + *stats_pred_cmp_type = CStatsPred::EstatscmptEqNDV; + return true; + } - if (*stats_pred_cmp_type == CStatsPred::EstatscmptEq) - { - (*col_ref_left) = CUtils::PcrExtractFromScExpression(expr_left); - (*col_ref_right) = CUtils::PcrExtractFromScExpression(expr_right); - - if (NULL == *col_ref_left || NULL == *col_ref_right) - { - return false; - } + // failed to extract a scalar ident + return false; +} - return true; - } - // failed to extract a scalar ident + +BOOL +CStatsPredUtils::AssignExprsToOuterAndInner + ( + CColRefSetArray *output_col_refsets, // array of output columns of join's relational inputs + CExpression *expr_1, + CExpression *expr_2, + CExpression **outer_expr, + CExpression **inner_expr + ) +{ + // see also CPhysicalJoin::FPredKeysSeparated(), which returns similar info + CColRefSet *used_cols_1 = expr_1->DeriveUsedColumns(); + CColRefSet *used_cols_2 = expr_2->DeriveUsedColumns(); + ULONG child_index_1 = 0; + ULONG child_index_2 = 0; + + if (0 == used_cols_1->Size() || 0 == used_cols_2->Size()) + { + // one of the sides is a constant + return false; + } + + // try just one ColRef from each side and find the associated input table + child_index_1 = CUtils::UlPcrIndexContainingSet(output_col_refsets, used_cols_1->PcrAny()); + child_index_2 = CUtils::UlPcrIndexContainingSet(output_col_refsets, used_cols_2->PcrAny()); + + if (gpos::ulong_max == child_index_1 || gpos::ulong_max == child_index_2) + { + // the predicate refers to columns that are not available + // (predicate from NAry join that refers to tables not yet being processed) + return false; + } + if (child_index_1 == 
child_index_2) + { + // both sides refer to the same input table + return false; + } + + // we tried one ColRef above, now try all of them, if there are multiple + if ((1 < used_cols_1->Size() && !(*output_col_refsets)[child_index_1]->ContainsAll(used_cols_1)) || + (1 < used_cols_2->Size() && !(*output_col_refsets)[child_index_2]->ContainsAll(used_cols_2))) + { + // at least one of the sides refers to more than one input table return false; } + if (child_index_1 < child_index_2) + { + GPOS_ASSERT(0 == child_index_1 && 1 == child_index_2); + *outer_expr = expr_1; + *inner_expr = expr_2; + } + else + { + GPOS_ASSERT(0 == child_index_2 && 1 == child_index_1); + *outer_expr = expr_2; + *inner_expr = expr_1; + } + return true; } - //--------------------------------------------------------------------------- // @function: // CStatsPredUtils::ExtractPredStats @@ -1133,28 +1239,6 @@ CStatsPredUtils::GetStatsPredFromBoolExpr return GPOS_NEW(mp) CStatsPredPoint(colid, CStatsPred::EstatscmptEq, GPOS_NEW(mp) CPoint(datum)); } -CStatsPred::EStatsCmpType -CStatsPredUtils::DeriveStatCmpEqNDVType - ( - ULONG left_index, - ULONG right_index, - BOOL left_is_null, - BOOL right_is_null - ) -{ - GPOS_ASSERT(left_is_null || right_is_null); - - // given an equi join condition f(a) = b, if the func is on - // outer side, consider the NDV stats on inner - if ((left_is_null && (left_index < right_index)) || - (right_is_null && (right_index < left_index))) - { - return CStatsPred::EstatscmptEqNDVInner; - } - - // otherwise consider NDV stats on outer - return CStatsPred::EstatscmptEqNDVOuter; -} //--------------------------------------------------------------------------- // @function: // CStatsPredUtils::ExtractJoinStatsFromJoinPred @@ -1170,6 +1254,7 @@ CStatsPredUtils::ExtractJoinStatsFromJoinPred CExpression *join_pred_expr, CColRefSetArray *output_col_refsets, // array of output columns of join's relational inputs CColRefSet *outer_refs, + BOOL is_semi_or_anti_join, CExpressionArray 
*unsupported_expr_array ) { @@ -1184,16 +1269,23 @@ CStatsPredUtils::ExtractJoinStatsFromJoinPred return NULL; } - const CColRef *col_ref_left = NULL; - const CColRef *col_ref_right = NULL; - BOOL left_is_from_expr = false; - BOOL right_is_from_expr = false; + const CColRef *col_ref_outer = NULL; + const CColRef *col_ref_inner = NULL; CStatsPred::EStatsCmpType stats_cmp_type = CStatsPred::EstatscmptOther; - BOOL fSupportedScIdentComparison = IsPredCmpColsOrIgnoreCast(join_pred_expr, &col_ref_left, &stats_cmp_type, &col_ref_right, left_is_from_expr, right_is_from_expr); + BOOL fSupportedScIdentComparison = IsJoinPredSupportedForStatsEstimation + ( + join_pred_expr, + output_col_refsets, + is_semi_or_anti_join, + &stats_cmp_type, + &col_ref_outer, + &col_ref_inner + ); if (fSupportedScIdentComparison && CStatsPred::EstatscmptOther != stats_cmp_type) { - if (!IMDType::StatsAreComparable(col_ref_left->RetrieveType(), col_ref_right->RetrieveType())) + if (NULL != col_ref_outer && NULL != col_ref_inner && + !IMDType::StatsAreComparable(col_ref_outer->RetrieveType(), col_ref_inner->RetrieveType())) { // unsupported statistics comparison between the histogram boundaries of the columns join_pred_expr->AddRef(); @@ -1201,24 +1293,10 @@ CStatsPredUtils::ExtractJoinStatsFromJoinPred return NULL; } - ULONG index_left = CUtils::UlPcrIndexContainingSet(output_col_refsets, col_ref_left); - ULONG index_right = CUtils::UlPcrIndexContainingSet(output_col_refsets, col_ref_right); + ULONG outer_id = (NULL != col_ref_outer ? col_ref_outer->Id() : gpos::ulong_max); + ULONG inner_id = (NULL != col_ref_inner ? 
col_ref_inner->Id() : gpos::ulong_max); - if (left_is_from_expr || right_is_from_expr) - { - stats_cmp_type = DeriveStatCmpEqNDVType(index_left, index_right, left_is_from_expr, right_is_from_expr); - } - - if (gpos::ulong_max != index_left && gpos::ulong_max != index_right && - index_left != index_right) - { - if (index_left < index_right) - { - return GPOS_NEW(mp) CStatsPredJoin(col_ref_left->Id(), stats_cmp_type, col_ref_right->Id()); - } - - return GPOS_NEW(mp) CStatsPredJoin(col_ref_right->Id(), stats_cmp_type, col_ref_left->Id()); - } + return GPOS_NEW(mp) CStatsPredJoin(outer_id, stats_cmp_type, inner_id); } if (CColRefSet::FCovered(output_col_refsets, col_refset_used)) @@ -1248,6 +1326,7 @@ CStatsPredUtils::ExtractJoinStatsFromJoinPredArray CExpression *scalar_expr, CColRefSetArray *output_col_refsets, // array of output columns of join's relational inputs CColRefSet *outer_refs, + BOOL is_semi_or_antijoin, CStatsPred **unsupported_stats_pred_array ) { @@ -1270,6 +1349,7 @@ CStatsPredUtils::ExtractJoinStatsFromJoinPredArray predicate_expr, output_col_refsets, outer_refs, + is_semi_or_antijoin, unsupported_expr_array ); if (NULL != join_stats) @@ -1314,7 +1394,8 @@ CStatsPredUtils::ExtractJoinStatsFromExpr CExpressionHandle &expr_handle, CExpression *pexprScalarInput, CColRefSetArray *output_col_refsets, // array of output columns of join's relational inputs - CColRefSet *outer_refs + CColRefSet *outer_refs, + BOOL is_semi_or_anti_join ) { GPOS_ASSERT(NULL != output_col_refsets); @@ -1330,6 +1411,7 @@ CStatsPredUtils::ExtractJoinStatsFromExpr scalar_expr, output_col_refsets, outer_refs, + is_semi_or_anti_join, &unsupported_pred_stats ); @@ -1353,8 +1435,9 @@ CStatsPredUtils::ExtractJoinStatsFromExpr CStatsPredJoinArray * CStatsPredUtils::ExtractJoinStatsFromExprHandle ( - CMemoryPool *mp, - CExpressionHandle &expr_handle + CMemoryPool *mp, + CExpressionHandle &expr_handle, + BOOL is_semi_or_anti_join ) { // in case of subquery in join predicate, we return 
empty stats @@ -1376,7 +1459,15 @@ CStatsPredUtils::ExtractJoinStatsFromExprHandle CExpression *scalar_expr = expr_handle.PexprScalarChild(expr_handle.Arity() - 1); CColRefSet *outer_refs = expr_handle.DeriveOuterReferences(); - CStatsPredJoinArray *join_pred_stats = ExtractJoinStatsFromExpr(mp, expr_handle, scalar_expr, output_col_refsets, outer_refs); + CStatsPredJoinArray *join_pred_stats = ExtractJoinStatsFromExpr + ( + mp, + expr_handle, + scalar_expr, + output_col_refsets, + outer_refs, + is_semi_or_anti_join + ); // clean up output_col_refsets->Release(); diff --git a/src/backend/gporca/libnaucrates/src/xml/dxltokens.cpp b/src/backend/gporca/libnaucrates/src/xml/dxltokens.cpp index 1c23743dfe8dbe4b55c2674d0c8d598ea3062e8e..753d5276ff37fbb5d69cbc9233f70f8d06c6282d 100644 --- a/src/backend/gporca/libnaucrates/src/xml/dxltokens.cpp +++ b/src/backend/gporca/libnaucrates/src/xml/dxltokens.cpp @@ -613,6 +613,7 @@ CDXLTokens::Init {EdxltokenCmpOther, GPOS_WSZ_LIT("Other")}, {EdxltokenReturnsNullOnNullInput, GPOS_WSZ_LIT("ReturnsNullOnNullInput")}, + {EdxltokenIsNDVPreserving, GPOS_WSZ_LIT("IsNDVPreserving")}, {EdxltokenTriggers, GPOS_WSZ_LIT("Triggers")}, {EdxltokenTrigger, GPOS_WSZ_LIT("Trigger")}, @@ -638,7 +639,8 @@ CDXLTokens::Init {EdxltokenGPDBFuncResultTypeId, GPOS_WSZ_LIT("ResultType")}, {EdxltokenGPDBFuncReturnsSet, GPOS_WSZ_LIT("ReturnsSet")}, {EdxltokenGPDBFuncStrict, GPOS_WSZ_LIT("IsStrict")}, - + {EdxltokenGPDBFuncNDVPreserving, GPOS_WSZ_LIT("IsNDVPreserving")}, + {EdxltokenGPDBAgg, GPOS_WSZ_LIT("GPDBAgg")}, {EdxltokenGPDBIsAggOrdered, GPOS_WSZ_LIT("IsOrdered")}, {EdxltokenGPDBAggResultTypeId, GPOS_WSZ_LIT("ResultType")}, diff --git a/src/backend/gporca/server/CMakeLists.txt b/src/backend/gporca/server/CMakeLists.txt index f544dfdc1caccbd87748d1cceed71578640f1e9e..fb8c08dee6a8228306c6a8793516f672bd9ae2dd 100644 --- a/src/backend/gporca/server/CMakeLists.txt +++ b/src/backend/gporca/server/CMakeLists.txt @@ -141,7 +141,7 @@ 
SingleColumnHomogenousIndexOnRoot-AO SingleColumnHomogenousIndexOnRoot-HEAP; CStatsTest: Stat-Derivation-Leaf-Pattern MissingBoolColStats JoinColWithOnlyNDV UnsupportedStatsPredicate -StatsFilter-AnyWithNewColStats; +StatsFilter-AnyWithNewColStats EquiJoinOnExpr-Supported EquiJoinOnExpr-Unsupported; CICGMiscTest: BroadcastSkewedHashjoin OrderByNullsFirst ConvertHashToRandomSelect ConvertHashToRandomInsert HJN-DeeperOuter CTAS CTAS-Random CheckAsUser diff --git a/src/include/catalog/pg_operator.h b/src/include/catalog/pg_operator.h index 93fb44710f2caa84e49a0bdab4f4ba91580ab3e6..92b90425fed9bc4b6c9c4a998fed7083654525c1 100644 --- a/src/include/catalog/pg_operator.h +++ b/src/include/catalog/pg_operator.h @@ -536,6 +536,7 @@ DATA(insert OID = 643 ( "<>" PGNSP PGUID b f f 19 19 16 643 93 namene neqsel DESCR("not equal"); DATA(insert OID = 654 ( "||" PGNSP PGUID b f f 25 25 25 0 0 textcat - - )); DESCR("concatenate"); +#define OIDTextConcatenateOperator 654 DATA(insert OID = 660 ( "<" PGNSP PGUID b f f 19 19 16 662 663 namelt scalarltsel scalarltjoinsel )); DESCR("less than"); diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 34da66c1ceb40cc8adaeb2f58e5b245a72c97509..b389fcf25859affcac690585270dcecbfd258110 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -1912,8 +1912,10 @@ DATA(insert OID = 868 ( strpos PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 2 DESCR("position of substring"); DATA(insert OID = 870 ( lower PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 25 "25" _null_ _null_ _null_ _null_ _null_ lower _null_ _null_ _null_ )); DESCR("lowercase"); +#define LOWER_OID 870 DATA(insert OID = 871 ( upper PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 25 "25" _null_ _null_ _null_ _null_ _null_ upper _null_ _null_ _null_ )); DESCR("uppercase"); +#define UPPER_OID 871 DATA(insert OID = 872 ( initcap PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 25 "25" _null_ _null_ _null_ _null_ _null_ initcap _null_ _null_ _null_ )); 
DESCR("capitalize each word"); DATA(insert OID = 873 ( lpad PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 25 "25 23 25" _null_ _null_ _null_ _null_ _null_ lpad _null_ _null_ _null_ )); @@ -1936,14 +1938,17 @@ DATA(insert OID = 880 ( rpad PGNSP PGUID 14 1 0 0 0 f f f f t f i s 2 0 25 DESCR("right-pad string to length"); DATA(insert OID = 881 ( ltrim PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 25 "25" _null_ _null_ _null_ _null_ _null_ ltrim1 _null_ _null_ _null_ )); DESCR("trim spaces from left end of string"); +#define LTRIM_SPACE_OID 881 DATA(insert OID = 882 ( rtrim PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 25 "25" _null_ _null_ _null_ _null_ _null_ rtrim1 _null_ _null_ _null_ )); DESCR("trim spaces from right end of string"); +#define RTRIM_SPACE_OID 882 DATA(insert OID = 883 ( substr PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 25 "25 23" _null_ _null_ _null_ _null_ _null_ text_substr_no_len _null_ _null_ _null_ )); DESCR("extract portion of string"); DATA(insert OID = 884 ( btrim PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 25 "25 25" _null_ _null_ _null_ _null_ _null_ btrim _null_ _null_ _null_ )); DESCR("trim selected characters from both ends of string"); DATA(insert OID = 885 ( btrim PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 25 "25" _null_ _null_ _null_ _null_ _null_ btrim1 _null_ _null_ _null_ )); DESCR("trim spaces from both ends of string"); +#define BTRIM_SPACE_OID 885 DATA(insert OID = 936 ( substring PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 25 "25 23 23" _null_ _null_ _null_ _null_ _null_ text_substr _null_ _null_ _null_ )); DESCR("extract portion of string"); diff --git a/src/include/gpopt/gpdbwrappers.h b/src/include/gpopt/gpdbwrappers.h index e91b5d912cae7eecbd5ee9688a49d26cb524af90..509beeca752974acbf59f10d6e0c5337dcbe976b 100644 --- a/src/include/gpopt/gpdbwrappers.h +++ b/src/include/gpopt/gpdbwrappers.h @@ -204,6 +204,9 @@ namespace gpdb { // is the given function strict bool FuncStrict(Oid funcid); + // does this preserve the NDVs of its inputs? 
+ bool IsFuncNDVPreserving(Oid funcid); + // stability property of given function char FuncStability(Oid funcid); @@ -480,6 +483,9 @@ namespace gpdb { // is the given operator strict bool IsOpStrict(Oid opno); + // does it preserve the NDVs of its inputs + bool IsOpNDVPreserving(Oid opno); + // get input types for a given operator void GetOpInputTypes(Oid opno, Oid *lefttype, Oid *righttype); diff --git a/src/include/gpopt/translate/CTranslatorRelcacheToDXL.h b/src/include/gpopt/translate/CTranslatorRelcacheToDXL.h index 3df15331a65d87dc4d03f60f4f3581d1258b9e4e..3aaa00ae63a8990aa3b686b4ec96ee4d4030674d 100644 --- a/src/include/gpopt/translate/CTranslatorRelcacheToDXL.h +++ b/src/include/gpopt/translate/CTranslatorRelcacheToDXL.h @@ -165,6 +165,7 @@ namespace gpdxl IMDFunction::EFuncStbl *stability, // output: function stability IMDFunction::EFuncDataAcc *access, // output: function data access BOOL *is_strict, // output: is function strict? + BOOL *is_ndv_preserving, // output: preserves NDVs of inputs BOOL *ReturnsSet // output: does function return set? ); diff --git a/src/test/regress/expected/gporca_optimizer.out b/src/test/regress/expected/gporca_optimizer.out index 47d45285bd2a62f33f9b9947b98aa30490c13c44..c6a6f7a1735829f19f756250777086146d89a9ed 100644 --- a/src/test/regress/expected/gporca_optimizer.out +++ b/src/test/regress/expected/gporca_optimizer.out @@ -12264,32 +12264,35 @@ WHERE L1.lid = int4in(unknownout(meta.load_id)); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause. Creating a NULL policy entry. 
QUERY PLAN ---------------------------------------------------------------------------------------------------------------------- - Result (cost=0.00..437.37 rows=134 width=8) + Result (cost=0.00..431.10 rows=1 width=8) Output: c, lid - -> Redistribute Motion 3:3 (slice1; segments: 3) (cost=0.00..431.12 rows=134 width=8) + -> Redistribute Motion 3:3 (slice1; segments: 3) (cost=0.00..431.08 rows=1 width=8) Output: c, lid - -> HashAggregate (cost=0.00..431.12 rows=134 width=8) + -> GroupAggregate (cost=0.00..431.08 rows=1 width=8) Output: c, lid Group Key: t55.c, t55.lid - -> Hash Join (cost=0.00..431.08 rows=134 width=8) + -> Sort (cost=0.00..431.08 rows=1 width=8) Output: c, lid - Hash Cond: (t55.lid = int4in(unknownout(('99')))) - -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.02 rows=334 width=8) + Sort Key: t55.c, t55.lid + -> Hash Join (cost=0.00..431.08 rows=1 width=8) Output: c, lid - Hash Key: lid - -> Seq Scan on orca.t55 (cost=0.00..431.01 rows=334 width=8) + Hash Cond: (t55.lid = int4in(unknownout(('99')))) + -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.02 rows=334 width=8) Output: c, lid - -> Hash (cost=0.00..0.00 rows=1 width=8) - Output: ('99') - -> Result (cost=0.00..0.00 rows=1 width=8) + Hash Key: lid + -> Seq Scan on orca.t55 (cost=0.00..431.01 rows=334 width=8) + Output: c, lid + -> Hash (cost=0.00..0.00 rows=1 width=8) Output: ('99') -> Result (cost=0.00..0.00 rows=1 width=8) - Output: ('99'), int4in(unknownout(('99'))) - -> Result (cost=0.00..0.00 rows=1 width=1) - Output: '99' + Output: ('99') + -> Result (cost=0.00..0.00 rows=1 width=8) + Output: ('99'), int4in(unknownout(('99'))) + -> Result (cost=0.00..0.00 rows=1 width=1) + Output: '99' Optimizer: Pivotal Optimizer (GPORCA) Settings: optimizer=on, optimizer_cte_inlining_bound=1000, optimizer_join_order=query, optimizer_metadata_caching=on -(25 rows) +(28 rows) CREATE TABLE TP AS WITH META AS (SELECT '2020-01-01' AS VALID_DT, '99' AS LOAD_ID) diff --git 
a/src/test/regress/expected/join_optimizer.out b/src/test/regress/expected/join_optimizer.out index d9b58f9e53472b9cb059cc1d62ae25dc59c46dbe..8508edd15f5d7101577611f211cc585e179e92ee 100755 --- a/src/test/regress/expected/join_optimizer.out +++ b/src/test/regress/expected/join_optimizer.out @@ -4246,23 +4246,26 @@ select * from (tenk1 as a1 full join (select 1 as id) as yy on (a1.unique1 = yy.id)) on (xx.id = coalesce(yy.id)); QUERY PLAN ------------------------------------------------------- - Hash Left Join - Hash Cond: ((1) = COALESCE((1))) - -> Result - -> Hash - -> Gather Motion 3:1 (slice1; segments: 3) - -> Merge Full Join - Merge Cond: (unique1 = (1)) - -> Sort - Sort Key: unique1 - -> Seq Scan on tenk1 - -> Sort - Sort Key: (1) - -> Result +------------------------------------------------------------------ + Gather Motion 3:1 (slice1; segments: 3) + -> Hash Left Join + Hash Cond: ((1) = COALESCE((1))) + -> Result + -> Result + -> Hash + -> Redistribute Motion 3:3 (slice2; segments: 3) + Hash Key: COALESCE((1)) + -> Merge Full Join + Merge Cond: (unique1 = (1)) + -> Sort + Sort Key: unique1 + -> Seq Scan on tenk1 + -> Sort + Sort Key: (1) -> Result - Optimizer: Pivotal Optimizer (GPORCA) version 3.83.0 -(15 rows) + -> Result + Optimizer: Pivotal Optimizer (GPORCA) +(18 rows) select * from (select 1 as id) as xx