提交 ecefcc1c 编写于 作者: A Ashuka Xue

Allow stats estimation for text-like types only for histograms containing singleton buckets

In commit `Improve statistics calculation for exprs like "var = ANY
(ARRAY[...])"`, we improved the performance of cardinality estimation for
ArrayCmp. However, that change caused ArrayCmp expressions with text-like
types to default to NDV-based cardinality estimation even when valid
histograms were present.

This commit re-enables using histograms for text-like types provided it
is safe to do so.

Removed because non-singleton buckets are not valid for text:
- src/backend/gporca/data/dxl/minidump/CTE-12.mdp
- src/backend/gporca/data/dxl/statistics/Join-Statistics-Text-Input.xml
- src/backend/gporca/data/dxl/statistics/Join-Statistics-Text-Output.xml
Co-authored-by: Ashuka Xue <axue@pivotal.io>
Co-authored-by: Shreedhar Hardikar <shardikar@pivotal.io>
上级 87c905e9
因为 它太大了无法显示 source diff 。你可以改为 查看blob
......@@ -1011,7 +1011,7 @@
<dxl:Plan Id="0" SpaceSize="1">
<dxl:GatherMotion InputSegments="0,1" OutputSegments="-1">
<dxl:Properties>
<dxl:Cost StartupCost="0" TotalCost="23860.852395" Rows="32015.655971" Width="165"/>
<dxl:Cost StartupCost="0" TotalCost="16123.176758" Rows="2.000000" Width="165"/>
</dxl:Properties>
<dxl:ProjList>
<dxl:ProjElem ColId="0" Alias="p_partkey">
......@@ -1046,7 +1046,7 @@
<dxl:SortingColumnList/>
<dxl:TableScan>
<dxl:Properties>
<dxl:Cost StartupCost="0" TotalCost="21280.466050" Rows="32015.655971" Width="165"/>
<dxl:Cost StartupCost="0" TotalCost="16122.015625" Rows="2.000000" Width="165"/>
</dxl:Properties>
<dxl:ProjList>
<dxl:ProjElem ColId="0" Alias="p_partkey">
......
<?xml version="1.0" encoding="UTF-8"?>
<dxl:DXLMessage xmlns:dxl="http://greenplum.com/dxl/2010/12/">
<dxl:Statistics>
<dxl:DerivedRelationStats Rows="1000.000000">
<dxl:DerivedColumnStats ColId="0" Width="3.000000" NullFreq="0.000000" NdvRemain="0.000000" FreqRemain="0.000000" ColStatsMissing="false">
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABTE=" LintValue="161096236"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzEzNA==" LintValue="936018796"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzEzNA==" LintValue="936018796"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzE3MA==" LintValue="919208812"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzE3MA==" LintValue="919208812"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzIwNg==" LintValue="927122212"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzIwNg==" LintValue="927122212"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzI0Mg==" LintValue="910312228"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzI0Mg==" LintValue="910312228"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzI3OQ==" LintValue="918758188"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzI3OQ==" LintValue="918758188"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzMxNA==" LintValue="927105900"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzMxNA==" LintValue="927105900"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzM1MA==" LintValue="910295916"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzM1MA==" LintValue="910295916"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzM4Nw==" LintValue="893576036"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzM4Nw==" LintValue="893576036"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzQyMg==" LintValue="937050916"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzQyMg==" LintValue="937050916"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzQ1OQ==" LintValue="911942444"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzQ1OQ==" LintValue="911942444"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzQ5NQ==" LintValue="895132460"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzQ5NQ==" LintValue="895132460"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzUzMA==" LintValue="937034604"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzUzMA==" LintValue="937034604"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzU2Ng==" LintValue="920306532"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzU2Ng==" LintValue="920306532"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzYwMQ==" LintValue="928129828"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzYwMQ==" LintValue="928129828"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzYzOA==" LintValue="936575788"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzYzOA==" LintValue="936575788"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzY3NA==" LintValue="919765804"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzY3NA==" LintValue="919765804"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABjcx" LintValue="890381100"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABjcx" LintValue="890381100"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzc0Ng==" LintValue="911393636"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzc0Ng==" LintValue="911393636"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzc4Mg==" LintValue="894583652"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzc4Mg==" LintValue="894583652"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzgxOA==" LintValue="929760044"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzgxOA==" LintValue="929760044"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzg1NA==" LintValue="912950060"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzg1NA==" LintValue="912950060"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzg5MA==" LintValue="896140076"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzg5MA==" LintValue="896140076"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzkyNg==" LintValue="938132324"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzkyNg==" LintValue="938132324"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABzk2Mg==" LintValue="921322340"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.040000" DistinctValues="39.960000">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzk2Mg==" LintValue="921322340"/>
<dxl:UpperBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABzk5OQ==" LintValue="896213868"/>
</dxl:StatsBucket>
</dxl:DerivedColumnStats>
</dxl:DerivedRelationStats>
<dxl:DerivedRelationStats Rows="20.000000">
<dxl:DerivedColumnStats ColId="8" Width="4.000000" NullFreq="0.000000" NdvRemain="0.000000" FreqRemain="0.000000">
<dxl:StatsBucket Frequency="0.052632" DistinctValues="1.052632">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABTE=" LintValue="161096236"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABjEw" LintValue="881984300"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.052632" DistinctValues="1.052632">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABjEw" LintValue="881984300"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABjEx" LintValue="881992492"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.052632" DistinctValues="1.052632">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABjEx" LintValue="881992492"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABjEy" LintValue="882000684"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.052632" DistinctValues="1.052632">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABjEy" LintValue="882000684"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABjEz" LintValue="882008876"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.052632" DistinctValues="1.052632">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABjEz" LintValue="882008876"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABjE0" LintValue="882017068"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.052632" DistinctValues="1.052632">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABjE0" LintValue="882017068"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABjE1" LintValue="882025260"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.052632" DistinctValues="1.052632">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABjE1" LintValue="882025260"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABjE2" LintValue="882033452"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.052632" DistinctValues="1.052632">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABjE2" LintValue="882033452"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABjE3" LintValue="882041644"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.052632" DistinctValues="1.052632">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABjE3" LintValue="882041644"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABjE4" LintValue="882049836"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.052632" DistinctValues="1.052632">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABjE4" LintValue="882049836"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABjE5" LintValue="882058028"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.052632" DistinctValues="1.052632">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABjE5" LintValue="882058028"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABTI=" LintValue="161104428"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.052632" DistinctValues="1.052632">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABTI=" LintValue="161104428"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABjIw" LintValue="873595684"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.052632" DistinctValues="1.052632">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABjIw" LintValue="873595684"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABTM=" LintValue="161112620"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.052632" DistinctValues="1.052632">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABTM=" LintValue="161112620"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABTQ=" LintValue="161120812"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.052632" DistinctValues="1.052632">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABTQ=" LintValue="161120812"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABTU=" LintValue="161129004"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.052632" DistinctValues="1.052632">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABTU=" LintValue="161129004"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABTY=" LintValue="161137196"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.052632" DistinctValues="1.052632">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABTY=" LintValue="161137196"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABTc=" LintValue="161145388"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.052632" DistinctValues="1.052632">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABTc=" LintValue="161145388"/>
<dxl:UpperBound Closed="false" TypeMdid="0.25.1.0" Value="AAAABTg=" LintValue="161153580"/>
</dxl:StatsBucket>
<dxl:StatsBucket Frequency="0.052632" DistinctValues="1.052632">
<dxl:LowerBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABTg=" LintValue="161153580"/>
<dxl:UpperBound Closed="true" TypeMdid="0.25.1.0" Value="AAAABTk=" LintValue="161161772"/>
</dxl:StatsBucket>
</dxl:DerivedColumnStats>
</dxl:DerivedRelationStats>
</dxl:Statistics>
</dxl:DXLMessage>
<?xml version="1.0" encoding="UTF-8"?>
<dxl:DXLMessage xmlns:dxl="http://greenplum.com/dxl/2010/12/">
<dxl:Statistics>
<dxl:DerivedRelationStats Rows="20.020020" EmptyRelation="false">
<dxl:DerivedColumnStats ColId="0" Width="3.000000" NullFreq="0.000000" NdvRemain="20.000008" FreqRemain="1.000000"/>
<dxl:DerivedColumnStats ColId="8" Width="4.000000" NullFreq="0.000000" NdvRemain="20.000008" FreqRemain="1.000000"/>
</dxl:DerivedRelationStats>
</dxl:Statistics>
</dxl:DXLMessage>
......@@ -178,7 +178,7 @@ namespace gpnaucrates
// check if the cardinality estimation should be done only via NDVs
static
BOOL DoNDVBasedCardEstimation(const CHistogram *histogram);
BOOL NeedsNDVBasedCardEstimationForEq(const CHistogram *histogram);
BOOL IsHistogramForTextRelatedTypes() const;
......@@ -350,6 +350,7 @@ namespace gpnaucrates
// how many buckets this histogram currently holds;
// the bucket array must have been set (asserted below)
ULONG GetNumBuckets() const
{
	GPOS_ASSERT(NULL != m_histogram_buckets);

	const ULONG num_buckets = m_histogram_buckets->Size();
	return num_buckets;
}
......@@ -365,6 +366,8 @@ namespace gpnaucrates
return m_is_well_defined;
}
BOOL ContainsOnlySingletonBuckets() const;
// is the column statistics missing in the database
BOOL IsColStatsMissing() const
{
......
......@@ -813,50 +813,55 @@ CBucket::MakeBucketIntersect
BOOL lower_new_is_closed = true;
BOOL upper_new_is_closed = true;
CDouble distance_new = 1.0;
if (!lower_new->Equals(upper_new))
CDouble ratio1(0.0);
CDouble ratio2(0.0);
// edge case
if (IsSingleton() && bucket->IsSingleton())
{
lower_new_is_closed = this->m_is_lower_closed;
upper_new_is_closed = this->m_is_upper_closed;
if (lower_new->Equals(bucket->GetLowerBound()))
ratio1 = CDouble(1.0);
ratio2 = CDouble(1.0);
}
else
{
CDouble distance_new = 1.0;
if (!lower_new->Equals(upper_new))
{
lower_new_is_closed = bucket->IsLowerClosed();
if (lower_new->Equals(this->GetLowerBound()))
lower_new_is_closed = this->m_is_lower_closed;
upper_new_is_closed = this->m_is_upper_closed;
if (lower_new->Equals(bucket->GetLowerBound()))
{
lower_new_is_closed = this->IsLowerClosed() && bucket->IsLowerClosed();
lower_new_is_closed = bucket->IsLowerClosed();
if (lower_new->Equals(this->GetLowerBound()))
{
lower_new_is_closed = this->IsLowerClosed() && bucket->IsLowerClosed();
}
}
}
if (upper_new->Equals(bucket->GetUpperBound()))
{
upper_new_is_closed = bucket->IsUpperClosed();
if (upper_new->Equals(this->GetUpperBound()))
if (upper_new->Equals(bucket->GetUpperBound()))
{
upper_new_is_closed = this->IsUpperClosed() && bucket->IsUpperClosed();
upper_new_is_closed = bucket->IsUpperClosed();
if (upper_new->Equals(this->GetUpperBound()))
{
upper_new_is_closed = this->IsUpperClosed() && bucket->IsUpperClosed();
}
}
}
distance_new = upper_new->Distance(lower_new);
}
// TODO: , May 1 2013, distance function for data types such as bpchar/varchar
// that require binary comparison
GPOS_ASSERT(distance_new <= Width());
GPOS_ASSERT(distance_new <= bucket->Width());
distance_new = upper_new->Distance(lower_new);
}
// assume the values are equally distributed in the old buckets, so allocate a
// proportional value of NDVs to the new bucket
CDouble ratio1 = distance_new / Width();
CDouble ratio2 = distance_new / bucket->Width();
// TODO: , May 1 2013, distance function for data types such as bpchar/varchar
// that require binary comparison
GPOS_ASSERT(distance_new <= Width());
GPOS_ASSERT(distance_new <= bucket->Width());
// edge case
if (IsSingleton() && bucket->IsSingleton())
{
ratio1 = CDouble(1.0);
ratio2 = CDouble(1.0);
// assume the values are equally distributed in the old buckets, so allocate a
// proportional value of NDVs to the new bucket
ratio1 = distance_new / Width();
ratio2 = distance_new / bucket->Width();
}
// we are assuming an equi-join, so the side with the fewest NDVs determines the
// NDV of the join, any values on one side that don't match the other side are
// discarded
......
......@@ -647,8 +647,11 @@ CHistogram::IsValid
return false;
}
if (!IsHistogramForTextRelatedTypes())
if (IsHistogramForTextRelatedTypes())
{
return m_histogram_buckets->Size() == 0 || this->ContainsOnlySingletonBuckets();
}
else {
for (ULONG bucket_index = 1; bucket_index < m_histogram_buckets->Size(); bucket_index++)
{
CBucket *bucket = (*m_histogram_buckets)[bucket_index];
......@@ -1144,6 +1147,21 @@ CHistogram::IsOpSupportedForFilter
}
}
// does every bucket of this histogram report itself as a singleton?
// A histogram with no buckets trivially satisfies the check and yields true.
BOOL
CHistogram::ContainsOnlySingletonBuckets() const
{
	const ULONG num_buckets = m_histogram_buckets->Size();

	// advance past the leading run of singleton buckets; stop at the first
	// bucket that is not a singleton
	ULONG bucket_index = 0;
	while (bucket_index < num_buckets &&
		   (*m_histogram_buckets)[bucket_index]->IsSingleton())
	{
		++bucket_index;
	}

	// the scan reaches the end only when no non-singleton bucket was found
	return bucket_index == num_buckets;
}
// is comparison type supported for join?
BOOL
CHistogram::JoinPredCmpTypeIsSupported
......@@ -1174,10 +1192,10 @@ CHistogram::MakeJoinHistogramEqualityFilter
CDouble distinct_remaining(0.0);
CDouble freq_remaining(0.0);
BOOL NDVBasedJoinCardEstimation1 = DoNDVBasedCardEstimation(this);
BOOL NDVBasedJoinCardEstimation2 = DoNDVBasedCardEstimation(histogram);
BOOL needs_ndv1 = NeedsNDVBasedCardEstimationForEq(this);
BOOL needs_ndv2 = NeedsNDVBasedCardEstimationForEq(histogram);
if (NDVBasedJoinCardEstimation1 || NDVBasedJoinCardEstimation2)
if (needs_ndv1 || needs_ndv2)
{
return MakeNDVBasedJoinHistogramEqualityFilter(histogram);
}
......@@ -2069,7 +2087,7 @@ CHistogram::MakeDefaultBoolHistogram
// check if the join cardinality estimation can be done based on NDV alone
BOOL
CHistogram::DoNDVBasedCardEstimation
CHistogram::NeedsNDVBasedCardEstimationForEq
(
const CHistogram *histogram
)
......@@ -2096,13 +2114,24 @@ CHistogram::DoNDVBasedCardEstimation
return false;
}
BOOL result = true;
if (datum->StatsMappable() && datum->IsDatumMappableToDouble())
if (datum->StatsMappable())
{
result = false;
if (datum->IsDatumMappableToDouble())
{
// int like type such as numeric
return false;
}
else if (datum->IsDatumMappableToLINT() && histogram->ContainsOnlySingletonBuckets())
{
// Types such as text should only produce histograms that contain only singleton buckets.
// The histograms cannot be used for range predicates but it is ok for equality predicates.
return false;
}
}
return result;
// For other cases, (e.g. certain non int types with non-singleton buckets),
// we are forced to use NDV based cardinality estimation.
return true;
}
// append given histograms to current object
......
......@@ -196,7 +196,6 @@ CJoinCardinalityTest::EresUnittest_Join()
{"../data/dxl/statistics/Join-Statistics-Input-Only-Nulls.xml", "../data/dxl/statistics/Join-Statistics-Output-Only-Nulls.xml", false, PdrgpstatspredjoinNullableCols},
{"../data/dxl/statistics/Join-Statistics-Input-Only-Nulls.xml", "../data/dxl/statistics/Join-Statistics-Output-LOJ-Only-Nulls.xml", true, PdrgpstatspredjoinNullableCols},
{"../data/dxl/statistics/Join-Statistics-DDistinct-Input.xml", "../data/dxl/statistics/Join-Statistics-DDistinct-Output.xml", false, PdrgpstatspredjoinSingleJoinPredicate},
{"../data/dxl/statistics/Join-Statistics-Text-Input.xml", "../data/dxl/statistics/Join-Statistics-Text-Output.xml", false, PdrgpstatspredjoinSingleJoinPredicate},
};
CColumnFactory *col_factory = COptCtxt::PoctxtFromTLS()->Pcf();
......
......@@ -41,7 +41,6 @@ const CHAR *rgszCTEFileNames[] =
"../data/dxl/minidump/CTE-9.mdp",
"../data/dxl/minidump/CTE-10.mdp",
"../data/dxl/minidump/CTE-11.mdp",
"../data/dxl/minidump/CTE-12.mdp",
"../data/dxl/minidump/CTE-with-random-filter.mdp",
"../data/dxl/minidump/CTE-volatile.mdp",
"../data/dxl/minidump/CTE-PartTbl.mdp",
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册