未验证 提交 a4362cba 编写于 作者: H Hans Zeller 提交者: GitHub

Support "NDV-preserving" function and op property (#10247)

Orca uses this property for cardinality estimation of joins.
For example, a join predicate foo join bar on foo.a = upper(bar.b)
will have a cardinality estimate similar to foo join bar on foo.a = bar.b.

Other functions, like foo join bar on foo.a = substring(bar.b, 1, 1)
won't be treated that way, since they are more likely to have a greater
effect on join cardinalities.

Since this is specific to ORCA, we use logic in the translator to determine
whether a function or operator is NDV-preserving. Right now, we consider
only a very limited set of functions and operators; we may add more at a later time.

Let's assume that we join tables R and S and that f is a function or
expression that refers to a single column and does not preserve
NDVs. Let's also assume that p is a function or expression that also
refers to a single column and that does preserve NDVs:

join predicate       card. estimate                         comment
-------------------  -------------------------------------  -----------------------------
col1 = col2          |R| * |S| / max(NDV(col1), NDV(col2))  build an equi-join histogram
f(col1) = p(col2)    |R| * |S| / NDV(col2)                  use NDV-based estimation
f(col1) = col2       |R| * |S| / NDV(col2)                  use NDV-based estimation
p(col1) = col2       |R| * |S| / max(NDV(col1), NDV(col2))  use NDV-based estimation
p(col1) = p(col2)    |R| * |S| / max(NDV(col1), NDV(col2))  use NDV-based estimation
otherwise            |R| * |S| * 0.4                        this is an unsupported pred
Note that adding casts to these expressions is ok, as well as switching left and right side.

Here is a list of expressions that we currently treat as NDV-preserving:

coalesce(col, const)
col || const
lower(col)
trim(col)
upper(col)

One more note: We need the NDVs of the inner side of Semi and
Anti-joins for cardinality estimation, so only normal columns and
NDV-preserving functions are allowed in that case.

This is a port of these GPDB 5X and GPOrca PRs:
https://github.com/greenplum-db/gporca/pull/585
https://github.com/greenplum-db/gpdb/pull/10090

This is take 2, after reverting the first attempt due to a merge conflict that
caused a test to fail.
上级 560ffcb1
......@@ -639,6 +639,28 @@ gpdb::FuncStrict
return false;
}
bool
gpdb::IsFuncNDVPreserving
	(
	Oid funcid
	)
{
	// Report whether funcid is on the short, hand-maintained list of functions
	// considered NDV-preserving, i.e. functions whose output is estimated to
	// have about as many distinct values as their input.

	// case-conversion functions
	const bool is_case_conversion =
		(LOWER_OID == funcid || UPPER_OID == funcid);

	// functions identified by the *TRIM_SPACE_OID constants
	const bool is_space_trim =
		(LTRIM_SPACE_OID == funcid ||
		 BTRIM_SPACE_OID == funcid ||
		 RTRIM_SPACE_OID == funcid);

	// for now, anything not listed above is treated as not NDV-preserving
	return is_case_conversion || is_space_trim;
}
char
gpdb::FuncStability
(
......@@ -2128,6 +2150,24 @@ gpdb::IsOpStrict
return false;
}
bool
gpdb::IsOpNDVPreserving
	(
	Oid opno
	)
{
	// For now, text concatenation is the only operator treated as
	// NDV-preserving. This answers for the operator alone: additional checks
	// are done later (e.g. col || 'const' is NDV-preserving, while
	// col1 || col2 is not).
	return OIDTextConcatenateOperator == opno;
}
void
gpdb::GetOpInputTypes
(
......
......@@ -1750,6 +1750,7 @@ CTranslatorRelcacheToDXL::RetrieveScOp
}
BOOL returns_null_on_null_input = gpdb::IsOpStrict(op_oid);
BOOL is_ndv_preserving = gpdb::IsOpNDVPreserving(op_oid);
CMDIdGPDB *mdid_hash_opfamily = NULL;
OID distr_opfamily = gpdb::GetCompatibleHashOpFamily(op_oid);
......@@ -1781,7 +1782,8 @@ CTranslatorRelcacheToDXL::RetrieveScOp
returns_null_on_null_input,
RetrieveScOpOpFamilies(mp, mdid),
mdid_hash_opfamily,
mdid_legacy_hash_opfamily
mdid_legacy_hash_opfamily,
is_ndv_preserving
);
return md_scalar_op;
}
......@@ -1802,12 +1804,14 @@ CTranslatorRelcacheToDXL::LookupFuncProps
IMDFunction::EFuncStbl *stability, // output: function stability
IMDFunction::EFuncDataAcc *access, // output: function data access
BOOL *is_strict, // output: is function strict?
BOOL *is_ndv_preserving, // output: preserves NDVs of inputs
BOOL *returns_set // output: does function return set?
)
{
GPOS_ASSERT(NULL != stability);
GPOS_ASSERT(NULL != access);
GPOS_ASSERT(NULL != is_strict);
GPOS_ASSERT(NULL != is_ndv_preserving);
GPOS_ASSERT(NULL != returns_set);
*stability = GetFuncStability(gpdb::FuncStability(func_oid));
......@@ -1818,6 +1822,7 @@ CTranslatorRelcacheToDXL::LookupFuncProps
*returns_set = gpdb::GetFuncRetset(func_oid);
*is_strict = gpdb::FuncStrict(func_oid);
*is_ndv_preserving = gpdb::IsFuncNDVPreserving(func_oid);
}
......@@ -1886,7 +1891,8 @@ CTranslatorRelcacheToDXL::RetrieveFunc
IMDFunction::EFuncDataAcc access = IMDFunction::EfdaNoSQL;
BOOL is_strict = true;
BOOL returns_set = true;
LookupFuncProps(func_oid, &stability, &access, &is_strict, &returns_set);
BOOL is_ndv_preserving = true;
LookupFuncProps(func_oid, &stability, &access, &is_strict, &is_ndv_preserving, &returns_set);
mdid->AddRef();
CMDFunctionGPDB *md_func = GPOS_NEW(mp) CMDFunctionGPDB
......@@ -1899,7 +1905,8 @@ CTranslatorRelcacheToDXL::RetrieveFunc
returns_set,
stability,
access,
is_strict
is_strict,
is_ndv_preserving
);
return md_func;
......
......@@ -530,7 +530,7 @@
<dxl:Plan Id="0" SpaceSize="21">
<dxl:GatherMotion InputSegments="0,1" OutputSegments="-1">
<dxl:Properties>
<dxl:Cost StartupCost="0" TotalCost="641930.734375" Rows="1000.000000" Width="4"/>
<dxl:Cost StartupCost="0" TotalCost="3219.015625" Rows="1000.000000" Width="4"/>
</dxl:Properties>
<dxl:ProjList>
<dxl:ProjElem ColId="27" Alias="?column?">
......@@ -541,7 +541,7 @@
<dxl:SortingColumnList/>
<dxl:Result>
<dxl:Properties>
<dxl:Cost StartupCost="0" TotalCost="641927.781250" Rows="1000.000000" Width="4"/>
<dxl:Cost StartupCost="0" TotalCost="3216.062500" Rows="1000.000000" Width="4"/>
</dxl:Properties>
<dxl:ProjList>
<dxl:ProjElem ColId="27" Alias="?column?">
......@@ -552,7 +552,7 @@
</dxl:ParamList>
<dxl:Result>
<dxl:Properties>
<dxl:Cost StartupCost="0" TotalCost="1919.921875" Rows="80.000000" Width="4"/>
<dxl:Cost StartupCost="0" TotalCost="1608.203125" Rows="0.200000" Width="4"/>
</dxl:Properties>
<dxl:ProjList>
<dxl:ProjElem ColId="9" Alias="i">
......@@ -681,7 +681,7 @@
<dxl:OneTimeFilter/>
<dxl:TableScan>
<dxl:Properties>
<dxl:Cost StartupCost="0" TotalCost="641922.875000" Rows="1000.000000" Width="4"/>
<dxl:Cost StartupCost="0" TotalCost="3211.156250" Rows="1000.000000" Width="4"/>
</dxl:Properties>
<dxl:ProjList>
<dxl:ProjElem ColId="0" Alias="i">
......
<?xml version="1.0" encoding="UTF-8"?>
<dxl:DXLMessage xmlns:dxl="http://greenplum.com/dxl/2010/12/">
<dxl:Comment><![CDATA[
Test case: Left outer join with outer refs in join predicate
drop table if exists x,y,z;
create table x(i int, j int);
create table y(i int, j int);
create table z(i int, j int);
insert into x select i, i%2 from generate_series(1, 10) i;
insert into y select i, i%2 from generate_series(1, 10) i;
insert into z select i, i%2 from generate_series(1, 1000) i;
analyze x;
analyze y;
analyze z;
set optimizer_enumerate_plans = on;
set optimizer_segments = 2;
explain select (select x.i from x left outer join y on x.i+y.i = z.i) from z;
]]>
</dxl:Comment>
<dxl:Thread Id="0">
<dxl:OptimizerConfig>
<dxl:EnumeratorConfig Id="0" PlanSamples="0" CostThreshold="0"/>
......
......@@ -173,7 +173,7 @@
<dxl:SumAgg Mdid="0.0.0.0"/>
<dxl:CountAgg Mdid="0.2147.1.0"/>
</dxl:Type>
<dxl:GPDBScalarOp Mdid="0.97.1.0" Name="&lt;" ComparisonType="LT" ReturnsNullOnNullInput="true">
<dxl:GPDBScalarOp Mdid="0.97.1.0" Name="&lt;" ComparisonType="LT" ReturnsNullOnNullInput="true" IsNDVPreserving="false">
<dxl:LeftType Mdid="0.23.1.0"/>
<dxl:RightType Mdid="0.23.1.0"/>
<dxl:ResultType Mdid="0.16.1.0"/>
......@@ -185,14 +185,14 @@
<dxl:Opfamily Mdid="0.3027.1.0"/>
</dxl:Opfamilies>
</dxl:GPDBScalarOp>
<dxl:GPDBFunc Mdid="0.274.1.0" Name="timeofday" ReturnsSet="false" Stability="Volatile" DataAccess="NoSQL" IsStrict="true">
<dxl:GPDBFunc Mdid="0.274.1.0" Name="timeofday" ReturnsSet="false" Stability="Volatile" DataAccess="NoSQL" IsStrict="true" IsNDVPreserving="false">
<dxl:ResultType Mdid="0.25.1.0"/>
</dxl:GPDBFunc>
<dxl:GPDBAgg Mdid="0.2101.1.0" Name="avg" IsSplittable="true" HashAggCapable="true">
<dxl:ResultType Mdid="0.1700.1.0"/>
<dxl:IntermediateResultType Mdid="0.17.1.0"/>
</dxl:GPDBAgg>
<dxl:GPDBFunc Mdid="0.17135.1.0" Name="fooro" ReturnsSet="true" Stability="Volatile" DataAccess="ReadsSQLData" IsStrict="false">
<dxl:GPDBFunc Mdid="0.17135.1.0" Name="fooro" ReturnsSet="true" Stability="Volatile" DataAccess="ReadsSQLData" IsStrict="false" IsNDVPreserving="false">
<dxl:ResultType Mdid="0.2249.1.0"/>
<dxl:OutputColumns TypeMdids="0.23.1.0,0.23.1.0"/>
</dxl:GPDBFunc>
......
......@@ -1077,11 +1077,9 @@ namespace gpopt
static
BOOL FCrossJoin(CExpression *pexpr);
// extract scalar ident column reference from scalar expression containing
// only one scalar ident in the tree
const static
CColRef *PcrExtractFromScExpression(CExpression *pexpr);
// is this scalar expression an NDV-preserving function (used for join stats derivation)
static
BOOL IsExprNDVPreserving(CExpression *pexpr, const CColRef **underlying_colref);
// search the given array of predicates for predicates with equality or IS NOT
// DISTINCT FROM operators that has one side equal to the given expression
......
......@@ -5116,18 +5116,112 @@ CUtils::FCrossJoin
return fCrossJoin;
}
// extract scalar ident column reference from scalar expression containing
// only one scalar ident in the tree
const CColRef *
CUtils::PcrExtractFromScExpression
// Determine whether a scalar expression consists only of a scalar id and NDV-preserving
// functions plus casts. If so, return the corresponding CColRef.
BOOL
CUtils::IsExprNDVPreserving
(
CExpression *pexpr
CExpression *pexpr,
const CColRef **underlying_colref
)
{
if (pexpr->DeriveUsedColumns()->Size() == 1)
return pexpr->DeriveUsedColumns()->PcrFirst();
CExpression *curr_expr = pexpr;
return NULL;
*underlying_colref = NULL;
// go down the expression tree, visiting the child containing a scalar ident until
// we found the ident or until we found a non-NDV-preserving function (at which point there
// is no more need to check)
while (1)
{
COperator *pop = curr_expr->Pop();
ULONG child_with_scalar_ident = 0;
switch (pop->Eopid())
{
case COperator::EopScalarIdent:
{
// we reached the bottom of the expression, return the ColRef
CScalarIdent *cr = CScalarIdent::PopConvert(pop);
*underlying_colref = cr->Pcr();
GPOS_ASSERT(1 == pexpr->DeriveUsedColumns()->Size());
return true;
}
case COperator::EopScalarCast:
// skip over casts
// Note: We might in the future investigate whether there are some casts
// that reduce NDVs by too much. Most, if not all, casts that have that potential are
// converted to functions, though. Examples: timestamp -> date, double precision -> int.
break;
case COperator::EopScalarCoalesce:
{
// coalesce(col, const1, ... constn) is treated as an NDV-preserving function
for (ULONG c=1; c<curr_expr->Arity(); c++)
{
if (0 < (*curr_expr)[c]->DeriveUsedColumns()->Size())
{
// this coalesce has a ColRef in the second or later arguments, assume for
// now that this doesn't preserve NDVs (we could add logic to support this case later)
return false;
}
}
break;
}
case COperator::EopScalarFunc:
{
// check whether the function is NDV-preserving
CMDAccessor *md_accessor = COptCtxt::PoctxtFromTLS()->Pmda();
CScalarFunc *sf = CScalarFunc::PopConvert(pop);
const IMDFunction *pmdfunc = md_accessor->RetrieveFunc(sf->FuncMdId());
if (!pmdfunc->IsNDVPreserving() || 1 != curr_expr->Arity())
{
return false;
}
break;
}
case COperator::EopScalarOp:
{
CMDAccessor *md_accessor = COptCtxt::PoctxtFromTLS()->Pmda();
CScalarOp *so = CScalarOp::PopConvert(pop);
const IMDScalarOp *pmdscop = md_accessor->RetrieveScOp(so->MdIdOp());
if (!pmdscop->IsNDVPreserving() || 2 != curr_expr->Arity())
{
return false;
}
// col <op> const is NDV-preserving, and so is const <op> col
if (0 ==(*curr_expr)[1]->DeriveUsedColumns()->Size())
{
// col <op> const
child_with_scalar_ident = 0;
}
else if (0 ==(*curr_expr)[0]->DeriveUsedColumns()->Size())
{
// const <op> col
child_with_scalar_ident = 1;
}
else
{
// give up for now, both children reference a column,
// e.g. col1 <op> col2
return false;
}
break;
}
default:
// anything else we see is considered non-NDV-preserving
return false;
}
curr_expr = (*curr_expr)[child_with_scalar_ident];
}
}
......
......@@ -182,7 +182,8 @@ CLogicalDifference::PstatsDerive
exprhdl,
pexprScCond,
output_colrefsets,
outer_refs
outer_refs,
true // is an LASJ
);
IStatistics *LASJ_stats = outer_stats->CalcLASJoinStats
(
......
......@@ -179,7 +179,8 @@ CLogicalDifferenceAll::PstatsDerive
exprhdl,
pexprScCond,
output_colrefsets,
outer_refs
outer_refs,
true // is an LASJ
);
IStatistics *LASJ_stats = outer_stats->CalcLASJoinStats
(
......
......@@ -200,7 +200,8 @@ CLogicalIntersectAll::PstatsDerive
exprhdl,
pexprScCond,
output_colrefsets,
outer_refs
outer_refs,
true // is a semi-join
);
IStatistics *pstatsSemiJoin = CLogicalLeftSemiJoin::PstatsDerive(mp, join_preds_stats, outer_stats, inner_side_stats);
......
......@@ -149,7 +149,7 @@ CLogicalLeftAntiSemiJoin::PstatsDerive
GPOS_ASSERT(Esp(exprhdl) > EspNone);
IStatistics *outer_stats = exprhdl.Pstats(0);
IStatistics *inner_side_stats = exprhdl.Pstats(1);
CStatsPredJoinArray *join_preds_stats = CStatsPredUtils::ExtractJoinStatsFromExprHandle(mp, exprhdl);
CStatsPredJoinArray *join_preds_stats = CStatsPredUtils::ExtractJoinStatsFromExprHandle(mp, exprhdl, true /*LASJ*/);
IStatistics *pstatsLASJoin = outer_stats->CalcLASJoinStats
(
mp,
......
......@@ -171,7 +171,7 @@ CLogicalLeftSemiJoin::PstatsDerive
GPOS_ASSERT(Esp(exprhdl) > EspNone);
IStatistics *outer_stats = exprhdl.Pstats(0);
IStatistics *inner_side_stats = exprhdl.Pstats(1);
CStatsPredJoinArray *join_preds_stats = CStatsPredUtils::ExtractJoinStatsFromExprHandle(mp, exprhdl);
CStatsPredJoinArray *join_preds_stats = CStatsPredUtils::ExtractJoinStatsFromExprHandle(mp, exprhdl, true/*semi-join*/);
IStatistics *pstatsSemiJoin = PstatsDerive(mp, join_preds_stats, outer_stats, inner_side_stats);
join_preds_stats->Release();
......
......@@ -61,6 +61,8 @@ namespace gpdxl
// function strictness (i.e. whether func returns NULL on NULL input)
BOOL m_is_strict;
BOOL m_is_ndv_preserving;
// private copy ctor
CParseHandlerMDGPDBFunc(const CParseHandlerMDGPDBFunc &);
......
......@@ -65,6 +65,9 @@ namespace gpdxl
IMDId *m_mdid_hash_opfamily;
IMDId *m_mdid_legacy_hash_opfamily;
// preserves NDVs of inputs
BOOL m_is_ndv_preserving;
// private copy ctor
CParseHandlerMDGPDBScalarOp(const CParseHandlerMDGPDBScalarOp &);
......
......@@ -573,6 +573,7 @@ namespace gpdxl
EdxltokenCmpOther,
EdxltokenReturnsNullOnNullInput,
EdxltokenIsNDVPreserving,
EdxltokenTriggers,
EdxltokenTrigger,
......@@ -598,6 +599,7 @@ namespace gpdxl
EdxltokenGPDBFuncResultTypeId,
EdxltokenGPDBFuncReturnsSet,
EdxltokenGPDBFuncStrict,
EdxltokenGPDBFuncNDVPreserving,
EdxltokenGPDBCast,
EdxltokenGPDBCastBinaryCoercible,
......
......@@ -64,6 +64,10 @@ namespace gpmd
// function strictness (i.e. whether func returns NULL on NULL input)
BOOL m_is_strict;
// function result has very similar number of distinct values as the
// single function argument (used for cardinality estimation)
BOOL m_is_ndv_preserving;
// dxl token array for stability
Edxltoken m_dxl_func_stability_array[EfsSentinel];
......@@ -97,7 +101,8 @@ namespace gpmd
BOOL ReturnsSet,
EFuncStbl func_stability,
EFuncDataAcc func_data_access,
BOOL is_strict
BOOL is_strict,
BOOL is_ndv_preserving
);
virtual
......@@ -133,6 +138,12 @@ namespace gpmd
return m_is_strict;
}
virtual
BOOL IsNDVPreserving() const
{
return m_is_ndv_preserving;
}
// function stability
virtual
EFuncStbl GetFuncStability() const
......
......@@ -81,6 +81,10 @@ namespace gpmd
// compatible legacy hash op family using legacy (cdbhash) opclass
IMDId *m_mdid_legacy_hash_opfamily;
// does operator preserve the NDV of its input(s)
// (used for cardinality estimation)
BOOL m_is_ndv_preserving;
CMDScalarOpGPDB(const CMDScalarOpGPDB &);
public:
......@@ -101,7 +105,8 @@ namespace gpmd
BOOL returns_null_on_null_input,
IMdIdArray *mdid_opfamilies_array,
IMDId *m_mdid_hash_opfamily,
IMDId *mdid_legacy_hash_opfamily
IMDId *mdid_legacy_hash_opfamily,
BOOL is_ndv_preserving
);
~CMDScalarOpGPDB();
......@@ -155,6 +160,10 @@ namespace gpmd
virtual
BOOL ReturnsNullOnNullInput() const;
// preserves NDVs of its inputs?
virtual
BOOL IsNDVPreserving() const;
// comparison type
virtual
IMDType::ECmpType ParseCmpType() const;
......
......@@ -65,6 +65,10 @@ namespace gpmd
virtual
BOOL IsStrict() const = 0;
// does function preserve NDVs of input (for cardinality estimation)
virtual
BOOL IsNDVPreserving() const = 0;
// does function return a set of values
virtual
BOOL ReturnsSet() const = 0;
......
......@@ -75,6 +75,10 @@ namespace gpmd
virtual
BOOL ReturnsNullOnNullInput() const = 0;
// preserves NDVs of its inputs?
virtual
BOOL IsNDVPreserving() const = 0;
virtual
IMDType::ECmpType ParseCmpType() const = 0;
......
......@@ -55,9 +55,8 @@ namespace gpnaucrates
EstatscmptINDF, // is not distinct from
EstatscmptLike, // LIKE predicate comparison
EstatscmptNotLike, // NOT LIKE predicate comparison
// NDV comparision for equality predicate on columns with functions, ex f(a) = b or a = f(b)
EstatscmptEqNDVOuter, // use Outer NDV on inner side also
EstatscmptEqNDVInner, // use Inner NDV on outer side also
// NDV comparison for equality predicate on columns with functions, ex f(a) = b or a = f(b)
EstatscmptEqNDV,
EstatscmptOther
};
......
......@@ -64,6 +64,11 @@ namespace gpnaucrates
{}
// accessors
BOOL HasValidColIdOuter() const
{
return gpos::ulong_max != m_colidOuter;
}
ULONG ColIdOuter() const
{
return m_colidOuter;
......@@ -75,6 +80,11 @@ namespace gpnaucrates
return m_stats_cmp_type;
}
BOOL HasValidColIdInner() const
{
return gpos::ulong_max != m_colidInner;
}
ULONG ColIdInner() const
{
return m_colidInner;
......
......@@ -140,10 +140,6 @@ namespace gpopt
static
CStatsPred::EStatsCmpType GetStatsCmpType(IMDId *mdid);
// derive whether it is EstatscmptEqNDVInner or EstatscmptEqNDVOuter
static
CStatsPred::EStatsCmpType DeriveStatCmpEqNDVType ( ULONG left_index, ULONG right_index, BOOL left_is_null, BOOL right_is_null);
// helper function to extract statistics join filter from a given join predicate
static
CStatsPredJoin *ExtractJoinStatsFromJoinPred
......@@ -152,20 +148,32 @@ namespace gpopt
CExpression *join_predicate_expr,
CColRefSetArray *join_output_col_refset, // array of output columns of join's relational inputs
CColRefSet *outer_refs,
BOOL is_semi_or_anti_join,
CExpressionArray *unsupported_predicates_expr
);
// is the expression a comparison of scalar idents (or casted scalar idents).
// If so, extract relevant info.
// Is the expression a comparison of scalar idents (or casted scalar idents),
// or of other supported expressions? If so, extract relevant info.
static
BOOL IsPredCmpColsOrIgnoreCast
BOOL IsJoinPredSupportedForStatsEstimation
(
CExpression *expr,
const CColRef **col_ref1,
CColRefSetArray *output_col_refsets, // array of output columns of join's relational inputs
BOOL is_semi_or_anti_join,
CStatsPred::EStatsCmpType *stats_pred_cmp_type,
const CColRef **col_ref2,
BOOL &left_is_null,
BOOL &right_is_null
const CColRef **col_ref_outer,
const CColRef **col_ref_inner
);
// find out which input expression refers only to the inner table and which
// refers only to the outer table, and return accordingly
static BOOL AssignExprsToOuterAndInner
(
CColRefSetArray *output_col_refsets, // array of output columns of join's relational inputs
CExpression *expr_1,
CExpression *expr_2,
CExpression **outer_expr,
CExpression **inner_expr
);
public:
......@@ -182,12 +190,18 @@ namespace gpopt
CExpression *scalar_expr,
CColRefSetArray *output_col_refset, // array of output columns of join's relational inputs
CColRefSet *outer_refs,
BOOL is_semi_or_anti_join,
CStatsPred **unsupported_pred_stats
);
// helper function to extract array of statistics join filter from an expression handle
static
CStatsPredJoinArray *ExtractJoinStatsFromExprHandle(CMemoryPool *mp, CExpressionHandle &expr_handle);
CStatsPredJoinArray *ExtractJoinStatsFromExprHandle
(
CMemoryPool *mp,
CExpressionHandle &expr_handle,
BOOL is_semi_or_anti_join
);
// helper function to extract array of statistics join filter from an expression
static
......@@ -197,7 +211,8 @@ namespace gpopt
CExpressionHandle &expr_handle,
CExpression *scalar_expression,
CColRefSetArray *output_col_refset,
CColRefSet *outer_refs
CColRefSet *outer_refs,
BOOL is_semi_or_anti_join
);
// is the predicate a conjunctive or disjunctive predicate
......
......@@ -38,7 +38,8 @@ CMDFunctionGPDB::CMDFunctionGPDB
BOOL ReturnsSet,
EFuncStbl func_stability,
EFuncDataAcc func_data_access,
BOOL is_strict
BOOL is_strict,
BOOL is_ndv_preserving
)
:
m_mp(mp),
......@@ -49,7 +50,8 @@ CMDFunctionGPDB::CMDFunctionGPDB
m_returns_set(ReturnsSet),
m_func_stability(func_stability),
m_func_data_access(func_data_access),
m_is_strict(is_strict)
m_is_strict(is_strict),
m_is_ndv_preserving(is_ndv_preserving)
{
GPOS_ASSERT(m_mdid->IsValid());
GPOS_ASSERT(EfsSentinel > func_stability);
......@@ -228,6 +230,7 @@ CMDFunctionGPDB::Serialize
xml_serializer->AddAttribute(CDXLTokens::GetDXLTokenStr(EdxltokenGPDBFuncStability), GetFuncStabilityStr());
xml_serializer->AddAttribute(CDXLTokens::GetDXLTokenStr(EdxltokenGPDBFuncDataAccess), GetFuncDataAccessStr());
xml_serializer->AddAttribute(CDXLTokens::GetDXLTokenStr(EdxltokenGPDBFuncStrict), m_is_strict);
xml_serializer->AddAttribute(CDXLTokens::GetDXLTokenStr(EdxltokenGPDBFuncNDVPreserving), m_is_ndv_preserving);
SerializeMDIdAsElem(xml_serializer, CDXLTokens::GetDXLTokenStr(EdxltokenGPDBFuncResultTypeId), m_mdid_type_result);
......
......@@ -43,7 +43,8 @@ CMDScalarOpGPDB::CMDScalarOpGPDB
BOOL returns_null_on_null_input,
IMdIdArray *mdid_opfamilies_array,
IMDId *mdid_hash_opfamily,
IMDId *mdid_legacy_hash_opfamily
IMDId *mdid_legacy_hash_opfamily,
BOOL is_ndv_preserving
)
:
m_mp(mp),
......@@ -59,7 +60,8 @@ CMDScalarOpGPDB::CMDScalarOpGPDB
m_returns_null_on_null_input(returns_null_on_null_input),
m_mdid_opfamilies_array(mdid_opfamilies_array),
m_mdid_hash_opfamily(mdid_hash_opfamily),
m_mdid_legacy_hash_opfamily(mdid_legacy_hash_opfamily)
m_mdid_legacy_hash_opfamily(mdid_legacy_hash_opfamily),
m_is_ndv_preserving(is_ndv_preserving)
{
GPOS_ASSERT(NULL != mdid_opfamilies_array);
m_dxl_str = CDXLUtils::SerializeMDObj(m_mp, this, false /*fSerializeHeader*/, false /*indentation*/);
......@@ -236,6 +238,12 @@ CMDScalarOpGPDB::ReturnsNullOnNullInput() const
}
//---------------------------------------------------------------------------
//	@function:
//		CMDScalarOpGPDB::IsNDVPreserving
//
//	@doc:
//		Does the operator preserve the NDV of its input(s)?
//		Returns the flag passed to the constructor.
//
//---------------------------------------------------------------------------
BOOL
CMDScalarOpGPDB::IsNDVPreserving() const
{
return m_is_ndv_preserving;
}
//---------------------------------------------------------------------------
// @function:
// CMDScalarOpGPDB::ParseCmpType
......@@ -272,6 +280,7 @@ CMDScalarOpGPDB::Serialize
xml_serializer->AddAttribute(CDXLTokens::GetDXLTokenStr(EdxltokenName), m_mdname->GetMDName());
xml_serializer->AddAttribute(CDXLTokens::GetDXLTokenStr(EdxltokenGPDBScalarOpCmpType), IMDType::GetCmpTypeStr(m_comparision_type));
xml_serializer->AddAttribute(CDXLTokens::GetDXLTokenStr(EdxltokenReturnsNullOnNullInput), m_returns_null_on_null_input);
xml_serializer->AddAttribute(CDXLTokens::GetDXLTokenStr(EdxltokenIsNDVPreserving), m_is_ndv_preserving);
Edxltoken dxl_token_array[8] = {
EdxltokenGPDBScalarOpLeftTypeId, EdxltokenGPDBScalarOpRightTypeId,
......
......@@ -105,6 +105,17 @@ CParseHandlerMDGPDBFunc::StartElement
EdxltokenGPDBFunc
);
// parse whether func is NDV-preserving
m_is_ndv_preserving = CDXLOperatorFactory::ExtractConvertAttrValueToBool
(
m_parse_handler_mgr->GetDXLMemoryManager(),
attrs,
EdxltokenGPDBFuncNDVPreserving,
EdxltokenGPDBFunc,
true, // optional
false // default is false
);
// parse func stability property
const XMLCh *xmlszStbl = CDXLOperatorFactory::ExtractAttrValue
(
......@@ -190,7 +201,8 @@ CParseHandlerMDGPDBFunc::EndElement
m_returns_set,
m_func_stability,
m_func_data_access,
m_is_strict);
m_is_strict,
m_is_ndv_preserving);
// deactivate handler
m_parse_handler_mgr->DeactivateHandler();
......
......@@ -53,7 +53,8 @@ CParseHandlerMDGPDBScalarOp::CParseHandlerMDGPDBScalarOp
m_comparision_type(IMDType::EcmptOther),
m_returns_null_on_null_input(false),
m_mdid_hash_opfamily(NULL),
m_mdid_legacy_hash_opfamily(NULL)
m_mdid_legacy_hash_opfamily(NULL),
m_is_ndv_preserving(false)
{
}
......@@ -122,6 +123,17 @@ CParseHandlerMDGPDBScalarOp::StartElement
);
}
// ndv-preserving property is optional
m_is_ndv_preserving = CDXLOperatorFactory::ExtractConvertAttrValueToBool
(
m_parse_handler_mgr->GetDXLMemoryManager(),
attrs,
EdxltokenIsNDVPreserving,
EdxltokenGPDBScalarOp,
true, // is optional
false // default value
);
}
else if (0 == XMLString::compareString(CDXLTokens::XmlstrToken(EdxltokenGPDBScalarOpLeftTypeId), element_local_name))
{
......@@ -292,7 +304,8 @@ CParseHandlerMDGPDBScalarOp::EndElement
m_returns_null_on_null_input,
mdid_opfamilies_array,
m_mdid_hash_opfamily,
m_mdid_legacy_hash_opfamily
m_mdid_legacy_hash_opfamily,
m_is_ndv_preserving
)
;
......
......@@ -216,6 +216,7 @@ CJoinStatsProcessor::CalcAllJoinStats
join_preds_available,
output_colrefsets,
outer_refs,
is_a_left_join, // left joins use an anti-semijoin internally
&unsupported_pred_stats
);
......@@ -307,8 +308,11 @@ CJoinStatsProcessor::SetResultingJoinStats
{
CStatsPredJoin *join_stats = (*join_pred_stats_info)[i];
if (join_stats->HasValidColIdOuter())
{
(void) join_colids->ExchangeSet(join_stats->ColIdOuter());
if (!semi_join)
}
if (!semi_join && join_stats->HasValidColIdInner())
{
(void) join_colids->ExchangeSet(join_stats->ColIdInner());
}
......@@ -331,30 +335,43 @@ CJoinStatsProcessor::SetResultingJoinStats
for (ULONG i = 0; i < num_join_conds; i++)
{
CStatsPredJoin *pred_info = (*join_pred_stats_info)[i];
CStatsPred::EStatsCmpType stats_cmp_type = pred_info->GetCmpType();
ULONG colid1 = pred_info->ColIdOuter();
ULONG colid2 = pred_info->ColIdInner();
GPOS_ASSERT(colid1 != colid2);
// find the histograms corresponding to the two columns
const CHistogram *outer_histogram = outer_stats->GetHistogram(colid1);
// are column id1 and 2 always in the order of outer inner?
const CHistogram *inner_histogram = inner_side_stats->GetHistogram(colid2);
GPOS_ASSERT(NULL != outer_histogram);
GPOS_ASSERT(NULL != inner_histogram);
const CHistogram *outer_histogram = NULL;
const CHistogram *inner_histogram = NULL;
BOOL is_input_empty = CStatistics::IsEmptyJoin(outer_stats, inner_side_stats, IsLASJ);
CDouble local_scale_factor(1.0);
CHistogram *outer_histogram_after = NULL;
CHistogram *inner_histogram_after = NULL;
// find the histograms corresponding to the two columns
// are column id1 and 2 always in the order of outer inner?
if (pred_info->HasValidColIdOuter())
{
outer_histogram = outer_stats->GetHistogram(colid1);
GPOS_ASSERT(NULL != outer_histogram);
}
if (pred_info->HasValidColIdInner())
{
inner_histogram = inner_side_stats->GetHistogram(colid2);
GPOS_ASSERT(NULL != inner_histogram);
}
// When we have any form of equi join with join condition of type f(a)=b,
// we calculate the NDV of such a join as NDV(b) ( from Selinger et al.)
if (CStatsPred::EstatscmptEqNDVOuter == stats_cmp_type)
if (NULL == outer_histogram)
{
inner_histogram = outer_histogram;
GPOS_ASSERT(CStatsPred::EstatscmptEqNDV == pred_info->GetCmpType());
outer_histogram = inner_histogram;
colid1 = colid2;
}
else if (CStatsPred::EstatscmptEqNDVInner == stats_cmp_type)
else if (NULL == inner_histogram)
{
outer_histogram = inner_histogram;
GPOS_ASSERT(CStatsPred::EstatscmptEqNDV == pred_info->GetCmpType());
inner_histogram = outer_histogram;
colid2 = colid1;
}
JoinHistograms
......@@ -377,7 +394,7 @@ CJoinStatsProcessor::SetResultingJoinStats
output_is_empty = JoinStatsAreEmpty(outer_stats->IsEmpty(), output_is_empty, outer_histogram, inner_histogram, outer_histogram_after, join_type);
CStatisticsUtils::AddHistogram(mp, colid1, outer_histogram_after, result_col_hist_mapping);
if (!semi_join)
if (!semi_join && colid1 != colid2)
{
CStatisticsUtils::AddHistogram(mp, colid2, inner_histogram_after, result_col_hist_mapping);
}
......@@ -385,6 +402,7 @@ CJoinStatsProcessor::SetResultingJoinStats
GPOS_DELETE(outer_histogram_after);
GPOS_DELETE(inner_histogram_after);
// remember which tables the columns came from, this info is used to combine scale factors
CColumnFactory *col_factory = COptCtxt::PoctxtFromTLS()->Pcf();
CColRef *colref_outer = col_factory->LookupColRef(colid1);
......@@ -401,6 +419,9 @@ CJoinStatsProcessor::SetResultingJoinStats
// there should only be two tables involved in a join condition
// if the predicate is more complex (i.e. more than 2 tables involved in the predicate such as t1.a=t2.a+t3.a),
// the mdid of the base table will be NULL:
// Note that we hash on the pointer to the Mdid, not the value of the Mdid,
// but we know that CColRef::GetMdidTable() will always return the same
// pointer for a given table.
mdid_pair = GPOS_NEW(mp) IMdIdArray(mp, 2);
mdid_outer->AddRef();
mdid_inner->AddRef();
......
......@@ -97,11 +97,14 @@ CLeftOuterJoinStatsProcessor::MakeLOJHistogram
GPOS_ASSERT(NULL != inner_join_stats);
// build a bitset with all outer child columns contributing to the join
CBitSet *outer_side_cols = GPOS_NEW(mp) CBitSet(mp);
CBitSet *outer_side_join_cols = GPOS_NEW(mp) CBitSet(mp);
for (ULONG j = 0; j < join_preds_stats->Size(); j++)
{
CStatsPredJoin *join_stats = (*join_preds_stats)[j];
(void) outer_side_cols->ExchangeSet(join_stats->ColIdOuter());
if (join_stats->HasValidColIdOuter())
{
(void) outer_side_join_cols->ExchangeSet(join_stats->ColIdOuter());
}
}
// for the columns in the outer child, compute the buckets that do not contribute to the inner join
......@@ -129,7 +132,7 @@ CLeftOuterJoinStatsProcessor::MakeLOJHistogram
const CHistogram *inner_join_histogram = inner_join_stats->GetHistogram(colid);
GPOS_ASSERT(NULL != inner_join_histogram);
if (outer_side_cols->Get(colid))
if (outer_side_join_cols->Get(colid))
{
// add buckets from the outer histogram that do not contribute to the inner join
const CHistogram *LASJ_histogram = LASJ_stats->GetHistogram(colid);
......@@ -167,7 +170,7 @@ CLeftOuterJoinStatsProcessor::MakeLOJHistogram
// clean up
inner_colids_with_stats->Release();
outer_colids_with_stats->Release();
outer_side_cols->Release();
outer_side_join_cols->Release();
return LOJ_histograms;
}
......
......@@ -33,10 +33,13 @@ CLeftSemiJoinStatsProcessor::CalcLSJoinStatsStatic
// iterate over all inner columns and perform a group by to remove duplicates
ULongPtrArray *inner_colids = GPOS_NEW(mp) ULongPtrArray(mp);
for (ULONG ul = 0; ul < length; ul++)
{
if ((*join_preds_stats)[ul]->HasValidColIdInner())
{
ULONG colid = ((*join_preds_stats)[ul])->ColIdInner();
inner_colids->Append(GPOS_NEW(mp) ULONG(colid));
}
}
// dummy agg columns required for group by derivation
ULongPtrArray *aggs = GPOS_NEW(mp) ULongPtrArray(mp);
......
......@@ -1180,6 +1180,7 @@ CStatisticsUtils::DeriveStatsForDynamicScan
scalar_expr,
output_colrefs,
outer_refs,
true, // semi-join
&unsupported_pred_stats
);
......@@ -1863,9 +1864,7 @@ CStatisticsUtils::IsStatsCmpTypeNdvEq
CStatsPred::EStatsCmpType stats_cmp_type
)
{
return (CStatsPred::EstatscmptEqNDVOuter == stats_cmp_type ||
CStatsPred::EstatscmptEqNDVInner == stats_cmp_type
);
return (CStatsPred::EstatscmptEqNDV == stats_cmp_type);
}
//---------------------------------------------------------------------------
// @function:
......
......@@ -59,34 +59,29 @@ CStatsPredUtils::StatsCmpType
CStatsPred::EStatsCmpType stats_cmp_type = CStatsPred::EstatscmptOther;
CWStringConst str_eq(GPOS_WSZ_LIT("="));
CWStringConst str_lt(GPOS_WSZ_LIT("<"));
CWStringConst str_leq(GPOS_WSZ_LIT("<="));
CWStringConst str_eq(GPOS_WSZ_LIT("="));
CWStringConst str_geq(GPOS_WSZ_LIT(">="));
CWStringConst str_gt(GPOS_WSZ_LIT(">"));
CWStringConst str_neq(GPOS_WSZ_LIT("<>"));
if (str_opname->Equals(&str_lt))
if (str_opname->Equals(&str_eq))
{
stats_cmp_type = CStatsPred::EstatscmptEq;
} else if (str_opname->Equals(&str_lt))
{
stats_cmp_type = CStatsPred::EstatscmptL;
}
if (str_opname->Equals(&str_leq))
} else if (str_opname->Equals(&str_leq))
{
stats_cmp_type = CStatsPred::EstatscmptLEq;
}
if (str_opname->Equals(&str_eq))
{
stats_cmp_type = CStatsPred::EstatscmptEq;
}
if (str_opname->Equals(&str_geq))
} else if (str_opname->Equals(&str_geq))
{
stats_cmp_type = CStatsPred::EstatscmptGEq;
}
if (str_opname->Equals(&str_gt))
} else if (str_opname->Equals(&str_gt))
{
stats_cmp_type = CStatsPred::EstatscmptG;
}
if (str_opname->Equals(&str_neq))
} else if (str_opname->Equals(&str_neq))
{
stats_cmp_type = CStatsPred::EstatscmptNEq;
}
......@@ -323,40 +318,69 @@ CStatsPredUtils::GetPredStats
//---------------------------------------------------------------------------
// @function:
// CStatsPredUtils::IsPredCmpColsOrIgnoreCast
// CStatsPredUtils::IsJoinPredSupportedForStatsEstimation
//
// @doc:
// Is the expression a comparison of scalar ident or cast of a scalar ident?
// Extract relevant info.
// Given a join predicate <expr>, return whether this is a supported
// join predicate for cardinality estimation, and what method to use
// to build the join statistics.
//
// Also return ColRefs for those sides of the comparison predicate that
// can be used (either the entire histogram or just the NDV).
//
// Supported predicates:
//
// All of these must reference the outer table only on one side
// and the inner table only on the other side.
//
// col1 <op> col2 (op could be INDF, IDF, =, <, <=, >, >=, <>)
// col1 = p(col2) (p is an NDV-preserving function)
// p(col1) = p(col2)
// col1 = expr(col2...coln)
// p(col1) = expr(col2...coln)
//
// plus variations of the above, flipping sides and adding casts.
// Non-NDV-preserving expressions are not allowed on the inner side
// of semi and anti-semijoins because we need the NDV of the join column
// for those (LOJ stats are calculated using a semi-join, so the
// restriction affects those as well).
//
// For all but the first line above, we use an NDV-based stats method.
//---------------------------------------------------------------------------
BOOL
CStatsPredUtils::IsPredCmpColsOrIgnoreCast
CStatsPredUtils::IsJoinPredSupportedForStatsEstimation
(
CExpression *expr,
const CColRef **col_ref_left,
CColRefSetArray *output_col_refsets, // array of output columns of join's relational inputs
BOOL is_semi_or_anti_join,
CStatsPred::EStatsCmpType *stats_pred_cmp_type,
const CColRef **col_ref_right,
BOOL &left_is_null,
BOOL &right_is_null
const CColRef **col_ref_outer,
const CColRef **col_ref_inner
)
{
GPOS_ASSERT(NULL != col_ref_left);
GPOS_ASSERT(NULL != col_ref_right);
GPOS_ASSERT(NULL != col_ref_outer);
GPOS_ASSERT(NULL != col_ref_inner);
GPOS_ASSERT(NULL == *col_ref_outer);
GPOS_ASSERT(NULL == *col_ref_inner);
COperator *expr_op = expr->Pop();
BOOL is_INDF = CPredicateUtils::FINDF(expr);
BOOL is_IDF = CPredicateUtils::FIDF(expr);
BOOL is_scalar_cmp = (COperator::EopScalarCmp == expr_op->Eopid());
// left and right children of our join pred operator
CExpression *expr_left = NULL;
CExpression *expr_right = NULL;
// initialize output parameters
*col_ref_inner = NULL;
*col_ref_outer = NULL;
if (!is_scalar_cmp && !is_INDF && !is_IDF)
{
// an unsupported expression
*stats_pred_cmp_type = CStatsPred::EstatscmptOther;
return false;
}
CExpression *expr_left = NULL;
CExpression *expr_right = NULL;
if (is_INDF)
{
(*stats_pred_cmp_type) = CStatsPred::EstatscmptINDF;
......@@ -384,28 +408,30 @@ CStatsPredUtils::IsPredCmpColsOrIgnoreCast
expr_right = (*expr)[1];
}
(*col_ref_left) = CCastUtils::PcrExtractFromScIdOrCastScId(expr_left);
(*col_ref_right) = CCastUtils::PcrExtractFromScIdOrCastScId(expr_right);
// expr_left and expr_right associated with the outer and inner tables
CExpression *assigned_expr_outer = NULL;
CExpression *assigned_expr_inner = NULL;
// if the equi join is of type f(a) = f(b) then it is unsupported stats comparison
// So, we fall back to default stats.(from Selinger et al.)
if (NULL == *col_ref_left && NULL == *col_ref_right)
return false;
if (NULL == *col_ref_left || NULL == *col_ref_right)
{
if (NULL == *col_ref_left)
if (!AssignExprsToOuterAndInner(output_col_refsets, expr_left, expr_right, &assigned_expr_outer, &assigned_expr_inner))
{
left_is_null = true;
// we are not dealing with a join predicate where one side of the operator
// refers to the outer table and the other side refers to the inner
return false;
}
if (NULL == *col_ref_right)
// check whether left or right expressions are simple columns or casts
// of simple columns
(*col_ref_outer) = CCastUtils::PcrExtractFromScIdOrCastScId(assigned_expr_outer);
(*col_ref_inner) = CCastUtils::PcrExtractFromScIdOrCastScId(assigned_expr_inner);
if (NULL != *col_ref_outer && NULL != *col_ref_inner)
{
right_is_null = true;
// a simple predicate of the form col1 <op> col2 (casts are allowed)
return true;
}
// if the scalar cmp is of equality type, we may not have been able to extract
// the column referenes of scalar ident if they had any other expression than cast
// the column references of scalar ident if they had any other expression than cast
// on top of them.
// in such cases, check if there is still a possibility to extract scalar ident,
// if there is more than one column reference on either side, this is unsupported
......@@ -413,24 +439,104 @@ CStatsPredUtils::IsPredCmpColsOrIgnoreCast
if (*stats_pred_cmp_type == CStatsPred::EstatscmptEq)
{
(*col_ref_left) = CUtils::PcrExtractFromScExpression(expr_left);
(*col_ref_right) = CUtils::PcrExtractFromScExpression(expr_right);
BOOL outer_is_ndv_preserving =
(NULL != *col_ref_outer || CUtils::IsExprNDVPreserving(assigned_expr_outer, col_ref_outer));
BOOL inner_is_ndv_preserving =
(NULL != *col_ref_inner || CUtils::IsExprNDVPreserving(assigned_expr_inner, col_ref_inner));
if (!outer_is_ndv_preserving && !inner_is_ndv_preserving)
{
// join pred of the form f(a) = f(b) with neither side NDV-preserving, this is not supported
return false;
}
if (NULL == *col_ref_left || NULL == *col_ref_right)
if (is_semi_or_anti_join && !inner_is_ndv_preserving)
{
// non-NDV-preserving functions on the inner of a semi-join or anti-semijoin
// are not supported, we need the NDV of the inner join columns to calculate
// the stats
return false;
}
// a join predicate that involves an NDV-preserving function on at least one side, one of
// *col_ref_inner and *col_ref_outer may be NULL. If expr(...) is a non-NDV-preserving
// expression and p is an NDV-preserving function, then we can have one of the following
// (including variations with flipped sides and casts added):
// col1 = p(col2) (use max of both NDVs)
// p(col1) = p(col2) (use max of both NDVs)
// col1 = expr(col2...coln) (use NDV of col1)
// p(col1) = expr(col2...coln) (use NDV of col1)
*stats_pred_cmp_type = CStatsPred::EstatscmptEqNDV;
return true;
}
// failed to extract a scalar ident
return false;
}
//---------------------------------------------------------------------------
//	@function:
//		CStatsPredUtils::AssignExprsToOuterAndInner
//
//	@doc:
//		Given the two operand expressions of a join predicate, decide which
//		one refers to the outer input and which one to the inner input, based
//		on the position of their used columns in output_col_refsets (the
//		lower position is treated as outer).
//
//		Returns true and fills *outer_expr / *inner_expr if each operand
//		uses at least one column, all columns of an operand are found in a
//		single entry of output_col_refsets, and the two operands map to
//		different entries. Returns false otherwise, leaving the output
//		parameters untouched.
//---------------------------------------------------------------------------
BOOL
CStatsPredUtils::AssignExprsToOuterAndInner
	(
	CColRefSetArray *output_col_refsets, // array of output columns of join's relational inputs
	CExpression *expr_1,
	CExpression *expr_2,
	CExpression **outer_expr,
	CExpression **inner_expr
	)
{
	// see also CPhysicalJoin::FPredKeysSeparated(), which returns similar info
	CColRefSet *cols_of_expr_1 = expr_1->DeriveUsedColumns();
	CColRefSet *cols_of_expr_2 = expr_2->DeriveUsedColumns();

	const ULONG num_cols_1 = cols_of_expr_1->Size();
	const ULONG num_cols_2 = cols_of_expr_2->Size();

	// an operand without any column reference is a constant, which
	// cannot be attributed to either join input
	if (0 == num_cols_1 || 0 == num_cols_2)
	{
		return false;
	}

	// probe with a single, arbitrary column of each operand to locate
	// the join input that it comes from
	const ULONG input_idx_1 = CUtils::UlPcrIndexContainingSet(output_col_refsets, cols_of_expr_1->PcrAny());
	const ULONG input_idx_2 = CUtils::UlPcrIndexContainingSet(output_col_refsets, cols_of_expr_2->PcrAny());

	// the predicate refers to columns that are not available
	// (predicate from NAry join that refers to tables not yet being processed)
	if (gpos::ulong_max == input_idx_1 || gpos::ulong_max == input_idx_2)
	{
		return false;
	}

	// both operands refer to the same input table
	if (input_idx_1 == input_idx_2)
	{
		return false;
	}

	// only one column per operand was probed above; if an operand uses several
	// columns, all of them must come from the input found for the probed one,
	// otherwise the operand refers to more than one input table
	if (1 < num_cols_1 && !(*output_col_refsets)[input_idx_1]->ContainsAll(cols_of_expr_1))
	{
		return false;
	}

	if (1 < num_cols_2 && !(*output_col_refsets)[input_idx_2]->ContainsAll(cols_of_expr_2))
	{
		return false;
	}

	// the operand that maps to the lower input index is the outer one
	if (input_idx_2 < input_idx_1)
	{
		GPOS_ASSERT(0 == input_idx_2 && 1 == input_idx_1);
		*outer_expr = expr_2;
		*inner_expr = expr_1;
	}
	else
	{
		GPOS_ASSERT(0 == input_idx_1 && 1 == input_idx_2);
		*outer_expr = expr_1;
		*inner_expr = expr_2;
	}

	return true;
}
//---------------------------------------------------------------------------
// @function:
// CStatsPredUtils::ExtractPredStats
......@@ -1133,28 +1239,6 @@ CStatsPredUtils::GetStatsPredFromBoolExpr
return GPOS_NEW(mp) CStatsPredPoint(colid, CStatsPred::EstatscmptEq, GPOS_NEW(mp) CPoint(datum));
}
//---------------------------------------------------------------------------
//	@function:
//		CStatsPredUtils::DeriveStatCmpEqNDVType
//
//	@doc:
//		For an equi-join condition of the form f(a) = b, choose which side's
//		NDV to use. left_index / right_index are the positions of the left
//		and right operand columns in the join's inputs (lower position is
//		the outer side); left_is_null / right_is_null flag the operand that
//		is wrapped in an expression. At least one flag must be set.
//---------------------------------------------------------------------------
CStatsPred::EStatsCmpType
CStatsPredUtils::DeriveStatCmpEqNDVType
	(
	ULONG left_index,
	ULONG right_index,
	BOOL left_is_null,
	BOOL right_is_null
	)
{
	GPOS_ASSERT(left_is_null || right_is_null);

	// the flagged operand sits on the outer side when its index is the lower one
	const BOOL left_expr_is_outer = left_is_null && (left_index < right_index);
	const BOOL right_expr_is_outer = right_is_null && (right_index < left_index);

	if (left_expr_is_outer || right_expr_is_outer)
	{
		// the func is on the outer side, so consider the NDV stats on inner
		return CStatsPred::EstatscmptEqNDVInner;
	}

	// otherwise consider NDV stats on outer
	return CStatsPred::EstatscmptEqNDVOuter;
}
//---------------------------------------------------------------------------
// @function:
// CStatsPredUtils::ExtractJoinStatsFromJoinPred
......@@ -1170,6 +1254,7 @@ CStatsPredUtils::ExtractJoinStatsFromJoinPred
CExpression *join_pred_expr,
CColRefSetArray *output_col_refsets, // array of output columns of join's relational inputs
CColRefSet *outer_refs,
BOOL is_semi_or_anti_join,
CExpressionArray *unsupported_expr_array
)
{
......@@ -1184,16 +1269,23 @@ CStatsPredUtils::ExtractJoinStatsFromJoinPred
return NULL;
}
const CColRef *col_ref_left = NULL;
const CColRef *col_ref_right = NULL;
BOOL left_is_from_expr = false;
BOOL right_is_from_expr = false;
const CColRef *col_ref_outer = NULL;
const CColRef *col_ref_inner = NULL;
CStatsPred::EStatsCmpType stats_cmp_type = CStatsPred::EstatscmptOther;
BOOL fSupportedScIdentComparison = IsPredCmpColsOrIgnoreCast(join_pred_expr, &col_ref_left, &stats_cmp_type, &col_ref_right, left_is_from_expr, right_is_from_expr);
BOOL fSupportedScIdentComparison = IsJoinPredSupportedForStatsEstimation
(
join_pred_expr,
output_col_refsets,
is_semi_or_anti_join,
&stats_cmp_type,
&col_ref_outer,
&col_ref_inner
);
if (fSupportedScIdentComparison && CStatsPred::EstatscmptOther != stats_cmp_type)
{
if (!IMDType::StatsAreComparable(col_ref_left->RetrieveType(), col_ref_right->RetrieveType()))
if (NULL != col_ref_outer && NULL != col_ref_inner &&
!IMDType::StatsAreComparable(col_ref_outer->RetrieveType(), col_ref_inner->RetrieveType()))
{
// unsupported statistics comparison between the histogram boundaries of the columns
join_pred_expr->AddRef();
......@@ -1201,24 +1293,10 @@ CStatsPredUtils::ExtractJoinStatsFromJoinPred
return NULL;
}
ULONG index_left = CUtils::UlPcrIndexContainingSet(output_col_refsets, col_ref_left);
ULONG index_right = CUtils::UlPcrIndexContainingSet(output_col_refsets, col_ref_right);
if (left_is_from_expr || right_is_from_expr)
{
stats_cmp_type = DeriveStatCmpEqNDVType(index_left, index_right, left_is_from_expr, right_is_from_expr);
}
if (gpos::ulong_max != index_left && gpos::ulong_max != index_right &&
index_left != index_right)
{
if (index_left < index_right)
{
return GPOS_NEW(mp) CStatsPredJoin(col_ref_left->Id(), stats_cmp_type, col_ref_right->Id());
}
ULONG outer_id = (NULL != col_ref_outer ? col_ref_outer->Id() : gpos::ulong_max);
ULONG inner_id = (NULL != col_ref_inner ? col_ref_inner->Id() : gpos::ulong_max);
return GPOS_NEW(mp) CStatsPredJoin(col_ref_right->Id(), stats_cmp_type, col_ref_left->Id());
}
return GPOS_NEW(mp) CStatsPredJoin(outer_id, stats_cmp_type, inner_id);
}
if (CColRefSet::FCovered(output_col_refsets, col_refset_used))
......@@ -1248,6 +1326,7 @@ CStatsPredUtils::ExtractJoinStatsFromJoinPredArray
CExpression *scalar_expr,
CColRefSetArray *output_col_refsets, // array of output columns of join's relational inputs
CColRefSet *outer_refs,
BOOL is_semi_or_antijoin,
CStatsPred **unsupported_stats_pred_array
)
{
......@@ -1270,6 +1349,7 @@ CStatsPredUtils::ExtractJoinStatsFromJoinPredArray
predicate_expr,
output_col_refsets,
outer_refs,
is_semi_or_antijoin,
unsupported_expr_array
);
if (NULL != join_stats)
......@@ -1314,7 +1394,8 @@ CStatsPredUtils::ExtractJoinStatsFromExpr
CExpressionHandle &expr_handle,
CExpression *pexprScalarInput,
CColRefSetArray *output_col_refsets, // array of output columns of join's relational inputs
CColRefSet *outer_refs
CColRefSet *outer_refs,
BOOL is_semi_or_anti_join
)
{
GPOS_ASSERT(NULL != output_col_refsets);
......@@ -1330,6 +1411,7 @@ CStatsPredUtils::ExtractJoinStatsFromExpr
scalar_expr,
output_col_refsets,
outer_refs,
is_semi_or_anti_join,
&unsupported_pred_stats
);
......@@ -1354,7 +1436,8 @@ CStatsPredJoinArray *
CStatsPredUtils::ExtractJoinStatsFromExprHandle
(
CMemoryPool *mp,
CExpressionHandle &expr_handle
CExpressionHandle &expr_handle,
BOOL is_semi_or_anti_join
)
{
// in case of subquery in join predicate, we return empty stats
......@@ -1376,7 +1459,15 @@ CStatsPredUtils::ExtractJoinStatsFromExprHandle
CExpression *scalar_expr = expr_handle.PexprScalarChild(expr_handle.Arity() - 1);
CColRefSet *outer_refs = expr_handle.DeriveOuterReferences();
CStatsPredJoinArray *join_pred_stats = ExtractJoinStatsFromExpr(mp, expr_handle, scalar_expr, output_col_refsets, outer_refs);
CStatsPredJoinArray *join_pred_stats = ExtractJoinStatsFromExpr
(
mp,
expr_handle,
scalar_expr,
output_col_refsets,
outer_refs,
is_semi_or_anti_join
);
// clean up
output_col_refsets->Release();
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册