Commit 4a5c58a5 authored by Bhuvnesh Chaudhary, committed by Bhuvnesh

Fix NDVRemain and FreqRemain calculation

For text, varchar, char and bpchar columns, ORCA does not collect MCV and
histogram information, so the calculation of NDVRemain and FreqRemain must
be updated to account for this.

For such columns, NDVRemain is the stadistinct value available in
pg_statistic, and FreqRemain is everything except the NULL frequency.

Earlier, NDVRemain and FreqRemain for such columns would be 0, resulting in
poor cardinality estimates and suboptimal plans.
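
As a worked illustration with hypothetical numbers (not values taken from
this commit): suppose a varchar column's pg_statistic row has
stadistinct = 100 and stanullfrac = 0.1. With this change ORCA derives
NDVRemain = 100 and FreqRemain = 1 - 0.1 = 0.9, where previously both came
out as 0. The inputs can be inspected directly:

    -- 'some_table' is a placeholder relation name for illustration
    SELECT staattnum, stadistinct, stanullfrac
    FROM pg_statistic
    WHERE starelid = 'some_table'::regclass;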
Signed-off-by: Ekta Khanna <ekhanna@pivotal.io>
Parent 0a5756cc
@@ -2477,10 +2477,11 @@ CTranslatorRelcacheToDXL::PimdobjColStats
     CDouble dNDVBuckets(0.0);
     CDouble dFreqBuckets(0.0);
+    CDouble dDistinctRemain(0.0);
+    CDouble dFreqRemain(0.0);
+    // We only want to create statistics buckets if the column is NOT a text, varchar, char or bpchar type
+    // For the above column types we will use NDVRemain and NullFreq to do cardinality estimation.
+    if (CTranslatorUtils::FCreateStatsBucket(oidAttType))
+    {
     // transform all the bits and pieces from pg_statistic
@@ -2511,18 +2512,23 @@ CTranslatorRelcacheToDXL::PimdobjColStats
         CUtils::AddRefAppend(pdrgpdxlbucket, pdrgpdxlbucketTransformed);
         pdrgpdxlbucketTransformed->Release();
     }
-    // there will be remaining tuples if the merged histogram and the NULLS do not cover
-    // the total number of distinct values
-    CDouble dDistinctRemain(0.0);
-    CDouble dFreqRemain(0.0);
-    if ((1 - CStatistics::DEpsilon > dFreqBuckets + dNullFrequency) &&
-        (0 < dDistinct - dNDVBuckets - iNullNDV))
+        // there will be remaining tuples if the merged histogram and the NULLS do not cover
+        // the total number of distinct values
+        if ((1 - CStatistics::DEpsilon > dFreqBuckets + dNullFrequency) &&
+            (0 < dDistinct - dNDVBuckets - iNullNDV))
+        {
+            dDistinctRemain = std::max(CDouble(0.0), (dDistinct - dNDVBuckets - iNullNDV));
+            dFreqRemain = std::max(CDouble(0.0), (1 - dFreqBuckets - dNullFrequency));
+        }
+    }
+    else
     {
-        dDistinctRemain = std::max(CDouble(0.0), (dDistinct - dNDVBuckets - iNullNDV));
-        dFreqRemain = std::max(CDouble(0.0), (1 - dFreqBuckets - dNullFrequency));
+        // in case of text, varchar, char or bpchar, there are no stats buckets, so the
+        // remaining frequency is everything excluding NULLs, and distinct remaining is the
+        // stadistinct as available in pg_statistic
+        dDistinctRemain = dDistinct;
+        dFreqRemain = 1 - dNullFrequency;
     }
     // free up allocated datum and float4 arrays
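
For the regression tests below, the intended effect can be sketched from the
statistics ANALYZE collects (expected values reasoned from the test data, not
output captured by this commit): column b of test_join_card2 holds 5820 copies
of 'a' plus 179 NULLs, so pg_stats should report n_distinct = 1 and a
null_frac of about 0.03. The old code gave this varchar column NDVRemain = 0
and FreqRemain = 0; with the fix it gets NDVRemain = 1 and a FreqRemain of
about 0.97.

    -- inspect the optimizer-visible statistics for the join column
    SELECT n_distinct, null_frac
    FROM pg_stats
    WHERE tablename = 'test_join_card2' AND attname = 'b';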
@@ -447,3 +447,49 @@ SELECT * FROM test_broken_stats t1, good_tab t2 WHERE t1.b = t2.b;
 (0 rows)
 RESET allow_system_table_mods;
+-- cardinality estimation for join on varchar, text, char and bpchar columns must account for FreqRemaining and NDVRemaining
+-- resulting in better cardinality numbers for the joins in orca
+-- start_ignore
+DROP TABLE IF EXISTS test_join_card1;
+NOTICE: table "test_join_card1" does not exist, skipping
+DROP TABLE IF EXISTS test_join_card2;
+NOTICE: table "test_join_card2" does not exist, skipping
+-- end_ignore
+CREATE TABLE test_join_card1 (a varchar, b varchar);
+NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+CREATE TABLE test_join_card2 (a varchar, b varchar);
+NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+CREATE TABLE test_join_card3 (a varchar, b varchar);
+NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+INSERT INTO test_join_card1 SELECT i::text, i::text FROM generate_series(1, 20000)i;
+INSERT INTO test_join_card2 SELECT i::text, NULL FROM generate_series(1, 179)i;
+INSERT INTO test_join_card2 SELECT 1::text, 'a' FROM generate_series(1, 5820)i;
+INSERT INTO test_join_card3 SELECT i::text, i::text FROM generate_series(1,10000)i;
+ANALYZE test_join_card1;
+ANALYZE test_join_card2;
+ANALYZE test_join_card3;
+EXPLAIN SELECT * FROM test_join_card1 t1, test_join_card2 t2, test_join_card3 t3 WHERE t1.b = t2.b and t3.b = t2.b;
+                                                  QUERY PLAN
+------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1 (slice3; segments: 3) (cost=717.00..1479.01 rows=2910 width=22)
+   ->  Hash Join (cost=717.00..1479.01 rows=970 width=22)
+         Hash Cond: t2.b::text = t1.b::text
+         ->  Broadcast Motion 3:3 (slice2; segments: 3) (cost=240.00..907.44 rows=5820 width=12)
+               ->  Hash Join (cost=240.00..674.64 rows=1940 width=12)
+                     Hash Cond: t2.b::text = t3.b::text
+                     ->  Broadcast Motion 3:3 (slice1; segments: 3) (cost=0.00..308.95 rows=5999 width=4)
+                           ->  Seq Scan on test_join_card2 t2 (cost=0.00..68.99 rows=2000 width=4)
+                     ->  Hash (cost=115.00..115.00 rows=3334 width=8)
+                           ->  Seq Scan on test_join_card3 t3 (cost=0.00..115.00 rows=3334 width=8)
+         ->  Hash (cost=227.00..227.00 rows=6667 width=10)
+               ->  Seq Scan on test_join_card1 t1 (cost=0.00..227.00 rows=6667 width=10)
+ Optimizer: legacy query optimizer
+(13 rows)
+-- start_ignore
+DROP TABLE IF EXISTS test_join_card1;
+DROP TABLE IF EXISTS test_join_card2;
+-- end_ignore
@@ -447,3 +447,53 @@ SELECT * FROM test_broken_stats t1, good_tab t2 WHERE t1.b = t2.b;
 (0 rows)
 RESET allow_system_table_mods;
+-- cardinality estimation for join on varchar, text, char and bpchar columns must account for FreqRemaining and NDVRemaining
+-- resulting in better cardinality numbers for the joins in orca
+-- start_ignore
+DROP TABLE IF EXISTS test_join_card1;
+NOTICE: table "test_join_card1" does not exist, skipping
+DROP TABLE IF EXISTS test_join_card2;
+NOTICE: table "test_join_card2" does not exist, skipping
+-- end_ignore
+CREATE TABLE test_join_card1 (a varchar, b varchar);
+NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+CREATE TABLE test_join_card2 (a varchar, b varchar);
+NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+CREATE TABLE test_join_card3 (a varchar, b varchar);
+NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+INSERT INTO test_join_card1 SELECT i::text, i::text FROM generate_series(1, 20000)i;
+INSERT INTO test_join_card2 SELECT i::text, NULL FROM generate_series(1, 179)i;
+INSERT INTO test_join_card2 SELECT 1::text, 'a' FROM generate_series(1, 5820)i;
+INSERT INTO test_join_card3 SELECT i::text, i::text FROM generate_series(1,10000)i;
+ANALYZE test_join_card1;
+ANALYZE test_join_card2;
+ANALYZE test_join_card3;
+EXPLAIN SELECT * FROM test_join_card1 t1, test_join_card2 t2, test_join_card3 t3 WHERE t1.b = t2.b and t3.b = t2.b;
+                                                   QUERY PLAN
+---------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1 (slice4; segments: 3) (cost=0.00..1297.58 rows=5999 width=22)
+   ->  Hash Join (cost=0.00..1297.09 rows=2000 width=22)
+         Hash Cond: test_join_card2.b::text = test_join_card3.b::text
+         ->  Hash Join (cost=0.00..864.43 rows=2000 width=14)
+               Hash Cond: test_join_card1.b::text = test_join_card2.b::text
+               ->  Redistribute Motion 3:3 (slice1; segments: 3) (cost=0.00..431.48 rows=6667 width=10)
+                     Hash Key: test_join_card1.b::text
+                     ->  Table Scan on test_join_card1 (cost=0.00..431.15 rows=6667 width=10)
+               ->  Hash (cost=431.08..431.08 rows=2000 width=4)
+                     ->  Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.08 rows=2000 width=4)
+                           Hash Key: test_join_card2.b
+                           ->  Table Scan on test_join_card2 (cost=0.00..431.04 rows=2000 width=4)
+         ->  Hash (cost=431.20..431.20 rows=3334 width=8)
+               ->  Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.20 rows=3334 width=8)
+                     Hash Key: test_join_card3.b::text
+                     ->  Table Scan on test_join_card3 (cost=0.00..431.07 rows=3334 width=8)
+ Optimizer: PQO version 2.56.0
+(17 rows)
+-- start_ignore
+DROP TABLE IF EXISTS test_join_card1;
+DROP TABLE IF EXISTS test_join_card2;
+-- end_ignore
@@ -277,3 +277,25 @@ UPDATE pg_statistic SET stavalues1='{1,2,3}'::int[] WHERE starelid ='bfv_statist
 SELECT * FROM test_broken_stats t1, good_tab t2 WHERE t1.b = t2.b;
 RESET allow_system_table_mods;
+-- cardinality estimation for join on varchar, text, char and bpchar columns must account for FreqRemaining and NDVRemaining
+-- resulting in better cardinality numbers for the joins in orca
+-- start_ignore
+DROP TABLE IF EXISTS test_join_card1;
+DROP TABLE IF EXISTS test_join_card2;
+-- end_ignore
+CREATE TABLE test_join_card1 (a varchar, b varchar);
+CREATE TABLE test_join_card2 (a varchar, b varchar);
+CREATE TABLE test_join_card3 (a varchar, b varchar);
+INSERT INTO test_join_card1 SELECT i::text, i::text FROM generate_series(1, 20000)i;
+INSERT INTO test_join_card2 SELECT i::text, NULL FROM generate_series(1, 179)i;
+INSERT INTO test_join_card2 SELECT 1::text, 'a' FROM generate_series(1, 5820)i;
+INSERT INTO test_join_card3 SELECT i::text, i::text FROM generate_series(1,10000)i;
+ANALYZE test_join_card1;
+ANALYZE test_join_card2;
+ANALYZE test_join_card3;
+EXPLAIN SELECT * FROM test_join_card1 t1, test_join_card2 t2, test_join_card3 t3 WHERE t1.b = t2.b and t3.b = t2.b;
+-- start_ignore
+DROP TABLE IF EXISTS test_join_card1;
+DROP TABLE IF EXISTS test_join_card2;
+-- end_ignore
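
Since the commit adds matching expected output for both optimizers, one way
to reproduce the comparison by hand is to toggle the planner per session (a
sketch; optimizer is the standard GPDB GUC that enables ORCA/PQO):

    -- compare the ORCA plan and the legacy plan for the test query
    SET optimizer = on;   -- ORCA (PQO), which now uses NDVRemain/FreqRemain
    EXPLAIN SELECT * FROM test_join_card1 t1, test_join_card2 t2, test_join_card3 t3
        WHERE t1.b = t2.b AND t3.b = t2.b;
    SET optimizer = off;  -- legacy query optimizer
    EXPLAIN SELECT * FROM test_join_card1 t1, test_join_card2 t2, test_join_card3 t3
        WHERE t1.b = t2.b AND t3.b = t2.b;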